Kernels:

kernels-community
/

deep-gemm

Trusted publisher

Kernel card Files Files and versions

xet

Community

danieldk HF Staff commited on Mar 18

Commit

96dea35

verified ·

1 Parent(s): e303cc7

Build uploaded using `kernels` (batch 9/10).

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/reduce_split_k.h +232 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce.h +264 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h +374 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h +362 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h +267 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h +248 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h +606 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h +641 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/thread/reduce.h +234 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/thread/reduction_operators.h +235 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/threadblock_swizzle.h +67 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/relatively_equal.h +305 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/semaphore.h +118 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/subbyte_reference.h +1388 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_coord.h +326 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_ref.h +419 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_ref_planar_complex.h +374 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_view.h +297 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_view_planar_complex.h +302 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tfloat32.h +479 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/thread/matrix.h +198 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/trace.h +59 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp +754 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp +303 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp +223 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp +603 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp +325 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/pitch_linear_thread_map.h +926 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/thread/transpose.h +107 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/thread/unary_op.h +105 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_iterator.h +199 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h +1350 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h +1315 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h +375 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h +328 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +2118 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h +834 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h +290 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h +892 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h +1887 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h +787 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h +818 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h +417 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h +253 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h +58 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h +408 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h +587 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h +821 -0
build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h +1532 -0

.gitattributes CHANGED Viewed

@@ -23,3 +23,4 @@ build/torch210-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=
 build/torch29-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu129-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text

 build/torch29-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 build/torch29-cxx11-cu129-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/reduce_split_k.h ADDED Viewed

	@@ -0,0 +1,232 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over densely packed tensors in global memory
+*/
+#pragma once
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/kernel/reduce_split_k.h"
+#include "cutlass/cuda_host_adapter.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace device {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename ReductionKernel_
+>
+class ReduceSplitK {
+public:
+  using ReductionKernel = ReductionKernel_;
+  using Shape = typename ReductionKernel::Shape;
+  using ReductionOp = typename ReductionKernel::ReductionOp;
+  using OutputOp = typename ReductionKernel::OutputOp;
+  using ElementWorkspace = typename ReductionKernel::ElementWorkspace;
+  using ElementAccumulator = typename ReductionKernel::ElementAccumulator;
+  using ElementOutput = typename ReductionKernel::ElementOutput;
+  using WorkspaceTensorRef = typename ReductionKernel::WorkspaceTensorRef;
+  using OutputTensorRef = typename ReductionKernel::OutputTensorRef;
+  using StrideIndex = typename ReductionKernel::StrideIndex;
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+  /// Argument structure
+  struct Arguments {
+    //
+    // Data members
+    //
+    MatrixCoord problem_size{0,0};
+    int partitions{1};
+    size_t partition_stride{0};
+    WorkspaceTensorRef workspace{};
+    OutputTensorRef destination{};
+    OutputTensorRef source{};
+    typename OutputOp::Params output{};
+    typename ReductionOp::Params reduction{};
+    //
+    // Methods
+    //
+    /// Default ctor
+    Arguments() = default;
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      MatrixCoord const & problem_size
+    ):
+      problem_size(problem_size) { }
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      MatrixCoord problem_size_,
+      int partitions_,
+      size_t partition_stride_,
+      WorkspaceTensorRef workspace_,
+      OutputTensorRef destination_,
+      OutputTensorRef source_,
+      typename OutputOp::Params output_ = typename OutputOp::Params(),
+      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      partitions(partitions_),
+      partition_stride(partition_stride_),
+      workspace(workspace_),
+      destination(destination_),
+      source(source_),
+      output(output_),
+      reduction(reduction_)
+    {
+    }
+  };
+private:
+  /// Kernel parameters object
+  typename ReductionKernel::Params params_;
+public:
+  /// Constructs Reduction SplitK
+  ReduceSplitK() { }
+  /// Determines whether the ReduceSplitK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+    return Status::kSuccess;
+  }
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    // needs no additional workspace
+    return 0;
+  }
+  /// Initializes Reduction state from arguments.
+  Status initialize(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr) {
+    // initialize the params structure from the arguments
+    params_ = typename ReductionKernel::Params(
+      args.problem_size,
+      args.partitions,
+      args.partition_stride,
+      args.workspace,
+      args.destination,
+      args.source,
+      args.output,
+      args.reduction
+    );
+    return Status::kSuccess;
+   }
+  /// Initializes Reduction kernel state from arguments.
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    // update the params structure from the arguments
+    params_.workspace.reset(args.workspace.non_const_ref().data());
+    params_.destination.reset(args.destination.non_const_ref().data());
+    params_.source.reset(args.source.non_const_ref().data());
+    params_.output = args.output;
+    params_.reduction = args.reduction;
+    return Status::kSuccess;
+  }
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    //
+    // Launch reduction kernel
+    //
+    dim3 block = ReductionKernel::block_shape();
+    dim3 grid = ReductionKernel::grid_shape(params_.problem_size);
+    if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params_};
+          cuda_adapter->launch(
+              grid, dim3(1,1,1), block, 0, stream, kernel_params, kernel_index);
+        }
+    }
+    else {
+      cutlass::arch::synclog_setup();
+      Kernel<ReductionKernel><<< grid, block, 0, stream >>>(params_);
+    }
+    cudaError_t result = cudaGetLastError();
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    return run(stream, cuda_adapter, kernel_index);
+  }
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args,
+    void *workspace = nullptr,
+    cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) {
+    Status status = initialize(args, workspace, stream);
+    if (status == Status::kSuccess) {
+      status = run(stream,cuda_adapter, kernel_index);
+    }
+    return status;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce.h ADDED Viewed

	@@ -0,0 +1,264 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/device/tensor_reduce_affine_strided.h"
+#include "cutlass/reduction/device/tensor_reduce_affine_contiguous.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace device {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tensor reduction operator on specific CUTLASS layouts over exactly one index
+template <
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename Layout_,
+  typename ReductionOp_,
+  int VectorLength_  = 1,
+  typename ElementCompute_ = ElementOutput_
+>
+struct TensorReduction {
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using Layout = Layout_;
+  using ReductionOp = ReductionOp_;
+  static int const kVectorLength = VectorLength_;
+  using ElementCompute = ElementCompute_;
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Reduction operator
+  using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
+    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
+  >;
+  using ReductionDeviceContiguousOperator = TensorReductionAffineContiguous<
+    4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
+  >;
+  //
+  // Data members
+  //
+  ReductionDeviceStridedOperator reduction_strided;
+  ReductionDeviceContiguousOperator reduction_contiguous;
+  int reduction_index;
+  //
+  // Methods
+  //
+  ///
+  TensorReduction(
+    TensorCoord extent,
+    int reduction_index_
+  ):
+    reduction_index(reduction_index_) {
+    Coord<4> extent_affine;
+    switch (reduction_index) {
+    case 0:
+      extent_affine[0] = extent[1];
+      extent_affine[1] = extent[2];
+      extent_affine[2] = extent[0];
+      extent_affine[3] = extent[3];
+      break;
+    case 1:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[2];
+      extent_affine[2] = extent[1];
+      extent_affine[3] = extent[3];
+      break;
+    case 2:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[1];
+      extent_affine[2] = extent[2];
+      extent_affine[3] = extent[3];
+      break;
+    case 3:
+      extent_affine[0] = extent[0];
+      extent_affine[1] = extent[1];
+      extent_affine[2] = extent[2];
+      extent_affine[3] = extent[3];
+      break;
+    default: break;
+    }
+    if (reduction_index == 3) {
+      reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
+    }
+    else {
+      reduction_strided = ReductionDeviceStridedOperator(extent_affine);
+    }
+  }
+  /// Simple check to verify the object is initialized correctly
+  bool good() const {
+    if (reduction_index == 3) {
+      return reduction_contiguous.good();
+    }
+    return reduction_strided.good();
+  }
+  /// Size of one workspace
+  int64_t workspace_stride() const {
+    if (reduction_index == 3) {
+      return reduction_contiguous.workspace_stride();
+    }
+    else {
+      return reduction_strided.workspace_stride();
+    }
+  }
+  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
+  int64_t workspace_size() const {
+    if (reduction_index == 3) {
+      return reduction_contiguous.workspace_size();
+    }
+    else {
+      return reduction_strided.workspace_size();
+    }
+  }
+  /// Helper to use overloaded function call operator
+  Status reduce(
+    TensorRef<ElementOutput, Layout> dst_ref,
+    TensorRef<ElementSource, Layout> src_ref,
+    void *device_workspace_ptr = nullptr,
+    ElementCompute reduction_identity = ElementCompute(),
+    ReductionOp reduction_op = ReductionOp(),
+    cudaStream_t stream = nullptr) {
+    int64_t src_stride[3];
+    int64_t dst_stride[3];
+    switch (reduction_index) {
+    case 0:
+      src_stride[0] = src_ref.stride()[1];
+      src_stride[1] = src_ref.stride()[0];
+      src_stride[2] = src_ref.stride()[2];
+      dst_stride[0] = dst_ref.stride()[1];
+      dst_stride[1] = dst_ref.stride()[0];
+      break;
+    case 1:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[0];
+      src_stride[2] = src_ref.stride()[1];
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[0];
+      break;
+    case 2:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[1];
+      src_stride[2] = src_ref.stride()[0];
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[1];
+      break;
+    case 3:
+      src_stride[0] = src_ref.stride()[2];
+      src_stride[1] = src_ref.stride()[1];
+      src_stride[2] = src_ref.stride()[0];
+      dst_stride[0] = dst_ref.stride()[2];
+      dst_stride[1] = dst_ref.stride()[1];
+      dst_stride[2] = dst_ref.stride()[0];
+    default: break;
+    }
+    if (reduction_index == 3) {
+      return reduction_contiguous(
+        dst_ref.data(),
+        dst_stride,
+        src_ref.data(),
+        src_stride,
+        device_workspace_ptr,
+        reduction_identity,
+        reduction_op,
+        stream);
+    }
+    else {
+      return reduction_strided(
+        dst_ref.data(),
+        dst_stride,
+        src_ref.data(),
+        src_stride,
+        device_workspace_ptr,
+        reduction_identity,
+        reduction_op,
+        stream);
+    }
+  }
+  Status operator()(
+    TensorRef<ElementOutput, Layout> dst_ref,
+    TensorRef<ElementSource, Layout> src_ref,
+    void *device_workspace_ptr = nullptr,
+    ElementCompute reduction_identity = ElementCompute(),
+    ReductionOp reduction_op = ReductionOp(),
+    cudaStream_t stream = nullptr) {
+    return reduce(
+      dst_ref,
+      src_ref,
+      device_workspace_ptr,
+      reduction_identity,
+      reduction_op,
+      stream);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h ADDED Viewed

	@@ -0,0 +1,374 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace device {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tensor reduction operator on layouts which are affine
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (e.g. ND => 2)
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename ReductionOp_,
+  int VectorLength  = 1,
+  typename ElementCompute_ = ElementOutput_,
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineContiguous {
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ReductionOp = ReductionOp_;
+  using ElementCompute = ElementCompute_;
+  //
+  // Data members
+  //
+  /// Internal status field
+  Status status;
+  /// Extent of tensor in source layout
+  Coord<kRank> extent;
+  /// Number of points in the outer index space
+  int64_t outer_count;
+  /// Number of elements in the inner index space
+  int64_t inner_count;
+  /// Number of workspaces needed
+  int workspace_count;
+  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 grid_shape;
+  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 threadblock_shape;
+  /// CUDA grid shape for the final reduction step if needed
+  dim3 grid_final;
+  /// CUDA threadblock shape for the final reduction step if needed
+  dim3 threadblock_final;
+private:
+  //
+  // Methods
+  //
+  /// Helper to reshape 'count' such that it is less than 2 x 'ext'
+  static int reshape_pow2(int ext, int count) {
+    if (ext > count) {
+      return 1;
+    }
+    int x = 1;
+    for (; count >= ext * 2; ) {
+      count >>= 1;
+      x <<= 1;
+    }
+    return x;
+  }
+public:
+  /// Default ctor
+  TensorReductionAffineContiguous():
+    status(Status::kErrorInvalidProblem),
+    extent(),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0),
+    grid_shape(0, 0, 0),
+    threadblock_shape(0, 0, 0) { }
+  /// Constructor
+  TensorReductionAffineContiguous(
+    Coord<kRank> extent_,
+    int target_threadblock_count = 128
+  ):
+    status(Status::kSuccess),
+    extent(extent_),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0) {
+    //
+    // Plan the parallel mapping strategy.
+    //
+    outer_count = 1;
+    inner_count = 1;
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank; ++p) {
+      outer_count *= extent[p];
+    }
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= extent[kReducedRank + p];
+    }
+    int cta_count_x = 1;
+    int cta_count_y = 1;
+    int cta_count_z = 1;
+    int cta_threads_x = kThreads;
+    int cta_threads_y = 1;
+    int cta_threads_z = 1;
+    // Determine CTA shape
+    int64_t inner_vector_count = inner_count / kVectorLength;
+    // Priority 1. Assign threadblocks to outer indices if possible
+    if (outer_count > target_threadblock_count) {
+      cta_count_x = 1;
+      cta_count_y = target_threadblock_count;
+      cta_count_z = 1;
+    }
+    else {
+      cta_count_y = int(outer_count);
+      int remaining_ctas = target_threadblock_count / cta_count_y;
+      // Priority 2. Assign inner dimensions to one CTA
+      if (inner_vector_count > cta_threads_x) {
+        int64_t cta_z_bound = inner_vector_count / cta_threads_x;
+        if (cta_z_bound > remaining_ctas) {
+          cta_count_z = remaining_ctas;
+        }
+        else {
+          cta_count_z = int(cta_z_bound);
+        }
+      }
+      else {
+        cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
+        cta_count_z = 1;
+      }
+    }
+    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
+    threadblock_shape = dim3(cta_threads_x, cta_threads_y, cta_threads_z);
+    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
+    // Determine shape of final reduction kernel if needed
+    if (workspace_count) {
+      int final_threads = kThreads;
+      int final_ctas = 1;
+      if (outer_count > kThreads) {
+        final_ctas = int(outer_count + kThreads - 1) / kThreads;
+      }
+      else {
+        final_threads = int(outer_count);
+      }
+      grid_final = dim3(final_ctas, 1, 1);
+      threadblock_final = dim3(final_threads, 1, 1);
+    }
+    else {
+      grid_final = dim3(0, 0, 0);
+      threadblock_final = dim3(0, 0, 0);
+    }
+  }
+  /// Simple check to verify the object is initialized correctly
+  bool good() const {
+    return status == Status::kSuccess;
+  }
+  /// Size (in bytes) of <outer_count> workspace elements which are densely packed together
+  int64_t workspace_stride() const {
+    // Error condition
+    if (!good()) {
+      return 0;
+    }
+    return outer_count * sizeof_bits<ElementCompute>::value / 8;
+  }
+  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
+  int64_t workspace_size() const {
+    // Error condition
+    if (!good()) {
+      return 0;
+    }
+    // No reduction across CTAs
+    if (grid_shape.z == 1) {
+      return 0;
+    }
+    return workspace_stride() * grid_shape.z;
+  }
+  /// Performs a reduction
+  Status reduce(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+    // Initial status check
+    if (!good()) {
+      return status;
+    }
+    // Guard against null workspace
+    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
+      return Status::kErrorWorkspaceNull;
+    }
+    // Define reduction kernel
+    using ReductionKernel = kernel::TensorReductionAffineContiguous<
+      kRank,
+      kReducedRank,
+      ElementOutput,
+      ElementSource,
+      ReductionOp,
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+    using FinalReductionKernel = kernel::TensorReductionAffineContiguousFinal<
+      kRank,
+      kReducedRank,
+      ElementOutput,
+      ElementSource,
+      ReductionOp,
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+    using Params = typename ReductionKernel::Params;
+    // Construct the parameters
+    Params params(
+      extent,
+      dst_ptr,
+      dst_stride,
+      src_ptr,
+      src_stride,
+      static_cast<ElementCompute *>(device_workspace_ptr),
+      workspace_stride(),
+      workspace_count,
+      reduction_op,
+      reduction_identity);
+    // Shared memory size
+    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
+    // Launch the kernel
+    cutlass::arch::synclog_setup();
+    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
+    // Check error condition
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+    // Final reduction kernel
+    if (workspace_count) {
+      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
+    }
+    // Check error condition
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+    return status;
+  }
+  /// Helper to use overloaded function call operator
+  Status operator()(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+    return reduce(dst_ptr, dst_stride, src_ptr, src_stride, device_workspace_ptr, reduction_identity, reduction_op, stream);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/device/tensor_reduce_affine_strided.h ADDED Viewed

	@@ -0,0 +1,362 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/kernel/tensor_reduce_affine_strided.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace device {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tensor reduction operator on layouts which are affine
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput_,
+  typename ElementSource_,
+  typename ReductionOp_,
+  int VectorLength  = 1,
+  typename ElementCompute_ = ElementOutput_,
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineStrided {
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ElementOutput = ElementOutput_;
+  using ElementSource = ElementSource_;
+  using ReductionOp = ReductionOp_;
+  using ElementCompute = ElementCompute_;
+  //
+  // Data members
+  //
+  /// Internal status field
+  Status status;
+  /// Extent of tensor in source layout
+  Coord<kRank> extent;
+  /// Number of points in the outer index space
+  int64_t outer_count;
+  /// Number of elements in the inner index space
+  int64_t inner_count;
+  /// Number of workspaces needed
+  int workspace_count;
+  /// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 grid_shape;
+  /// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
+  dim3 threadblock_shape;
+  /// CUDA grid shape for the final reduction step if needed
+  dim3 grid_final;
+  /// CUDA threadblock shape for the final reduction step if needed
+  dim3 threadblock_final;
+private:
+  //
+  // Methods
+  //
+  /// Helper to reshape 'count' such that it is less than 2 x 'ext'
+  static int reshape_pow2(int ext, int count) {
+    if (ext > count) {
+      return 1;
+    }
+    int x = 1;
+    for (; count >= ext * 2; ) {
+      count >>= 1;
+      x <<= 1;
+    }
+    return x;
+  }
+public:
+  /// Default ctor
+  TensorReductionAffineStrided():
+    status(Status::kErrorInvalidProblem),
+    extent(),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0),
+    grid_shape(0, 0, 0),
+    threadblock_shape(0, 0, 0) { }
+  /// Constructor
+  TensorReductionAffineStrided(
+    Coord<kRank> extent_,
+    int target_threadblock_count = 128
+  ):
+    status(Status::kSuccess),
+    extent(extent_),
+    outer_count(0),
+    inner_count(0),
+    workspace_count(0) {
+    //
+    // Plan the parallel mapping strategy.
+    //
+    outer_count = 1;
+    inner_count = 1;
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      outer_count *= extent[p];
+    }
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= extent[kReducedRank + p - 1];
+    }
+    // Compute plan for the reduction
+    int extent_c = extent[kRank - 1];
+    int vectors_c = (extent_c -1 + kVectorLength) / kVectorLength;
+    // Determine CTA shape
+    int cta_width = kThreads * kVectorLength;
+    int cta_ways = reshape_pow2(extent_c, cta_width);
+    int cta_threads_x = kThreads / cta_ways;
+    threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
+    // This leads to an error.
+    if (threadblock_shape.z > 1) {
+      if (threadblock_shape.y != 1) {
+        status = Status::kErrorInternal;
+        return;
+      }
+    }
+    // Determine grid shape
+    int cta_count_x = (vectors_c + cta_threads_x - 1) / cta_threads_x;
+    int cta_count_y = std::max(1, target_threadblock_count / cta_count_x);
+    // Limit the number of CTAs assigned to outer dimension
+    if (int64_t(cta_count_y * threadblock_shape.y) > outer_count) {
+      cta_count_y = int(outer_count + threadblock_shape.y - 1) / threadblock_shape.y;
+    }
+    // Limit the number of CTAs assigned to inner dimension
+    int cta_count_z = std::max(1, target_threadblock_count / cta_count_y);
+    if (int64_t(cta_count_z * threadblock_shape.z) > inner_count) {
+      cta_count_z = int(inner_count + threadblock_shape.z - 1) / threadblock_shape.z;
+    }
+    grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
+    workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
+    // Determine shape of final reduction kernel if needed
+    grid_final = dim3(cta_count_x, int(outer_count));
+    threadblock_final = dim3(cta_threads_x, 1, 1);
+  }
+  /// Simple check to verify the object is initialized correctly
+  bool good() const {
+    return status == Status::kSuccess;
+  }
+  /// Size of one CTA's workspace
+  int64_t workspace_stride() const {
+    // Error condition
+    if (!good()) {
+      return 0;
+    }
+    int vector_size_bytes = kVectorLength * sizeof_bits<ElementCompute>::value / 8;
+    return extent[kRank - 1] * vector_size_bytes;
+  }
+  /// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
+  int64_t workspace_size() const {
+    // Error condition
+    if (!good()) {
+      return 0;
+    }
+    // No reduction across CTAs
+    if (grid_shape.z == 1) {
+      return 0;
+    }
+    return workspace_stride() * outer_count * grid_shape.z;
+  }
+  /// Performs a reduction
+  Status reduce(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,             ///< Device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduciton identity
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+    // Initial status check
+    if (!good()) {
+      return status;
+    }
+    // Guard against null workspace
+    if (workspace_count > 1 && device_workspace_ptr == nullptr) {
+      return Status::kErrorWorkspaceNull;
+    }
+    // Define reduction kernel
+    using ReductionKernel = kernel::TensorReductionAffineStrided<
+      kRank,
+      kReducedRank,
+      ElementOutput,
+      ElementSource,
+      ReductionOp,
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+    using FinalReductionKernel = kernel::TensorReductionAffineStridedFinal<
+      kRank,
+      kReducedRank,
+      ElementOutput,
+      ElementSource,
+      ReductionOp,
+      kVectorLength,
+      ElementCompute,
+      kThreads>;
+    using Params = typename ReductionKernel::Params;
+    // Construct the parameters
+    Params params(
+      extent,
+      dst_ptr,
+      dst_stride,
+      src_ptr,
+      src_stride,
+      static_cast<ElementCompute *>(device_workspace_ptr),
+      workspace_stride(),
+      workspace_count,
+      reduction_op,
+      reduction_identity);
+    // Shared memory size
+    int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
+    // Launch the kernel
+    cutlass::arch::synclog_setup();
+    Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
+    // Check error condition
+    if (cudaPeekAtLastError() == cudaSuccess) {
+      status = Status::kSuccess;
+    }
+    else {
+      status = Status::kErrorInternal;
+    }
+    // Final reduction kernel
+    if (workspace_count) {
+      Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
+      // Check error condition
+      if (cudaPeekAtLastError() == cudaSuccess) {
+        status = Status::kSuccess;
+      }
+      else {
+        status = Status::kErrorInternal;
+      }
+    }
+    return status;
+  }
+  /// Helper to use overloaded function call operator
+  Status operator()(
+    ElementOutput *dst_ptr,                       ///< Pointer to destination tensor
+    int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank - 1)
+    ElementSource const *src_ptr,                 ///< Pointer to source tensor
+    int64_t src_stride[],                         ///< Stride vector (of length kRank - 1)
+    void *device_workspace_ptr = nullptr,         ///< Pointer to device workspace
+    ElementCompute reduction_identity = ElementCompute(), ///< Reduciton identity
+    ReductionOp reduction_op = ReductionOp(),     ///< Reduction operator
+    cudaStream_t stream = nullptr) {              ///< CUDA Stream into which all kernels are launched
+    return reduce(
+      dst_ptr,
+      dst_stride,
+      src_ptr,
+      src_stride,
+      device_workspace_ptr,
+      reduction_identity,
+      reduction_op,
+      stream);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace device
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/reduce_softmax_final.h ADDED Viewed

	@@ -0,0 +1,267 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a final reduction for softmax
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+template <
+  typename ElementNorm_,
+  typename ElementSum_,
+  typename ElementSoftmaxCompute_,
+  typename ThreadblockShape_,
+  bool GroupedProblem = false
+>
+class ApplySoftmaxFinalReduction {
+public:
+  using ElementNorm = ElementNorm_;
+  using ElementSum = ElementSum_;
+  using ElementSoftmaxCompute = ElementSoftmaxCompute_;
+  using ThreadblockShape = ThreadblockShape_;
+  static const bool isGroupedProblem = GroupedProblem;
+  //
+  // Arguments
+  //
+  struct Arguments {
+    cutlass::gemm::GemmCoord*  problem_sizes{nullptr};
+    cutlass::gemm::GemmCoord   problem_size{};
+    ElementNorm*               block_Norm{nullptr};
+    ElementSum*                block_Sum{nullptr};
+    int64_t*                   offset_Norm_Device{nullptr};
+    int64_t*                   offset_Sum_Device{nullptr};
+    int64_t                    batch_stride_Max{0};
+    int64_t                    batch_stride_Sum{0};
+    //
+    // Methods
+    //
+    Arguments() { }
+    // Non-grouped constructor without batching
+    Arguments(
+      cutlass::gemm::GemmCoord  problem_size,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum
+    ):
+      problem_size(problem_size),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      problem_sizes(nullptr),
+      offset_Norm_Device(nullptr),
+      offset_Sum_Device(nullptr),
+      batch_stride_Max(0),
+      batch_stride_Sum(0)
+    {
+    }
+    // Non-grouped constructor with batching
+    Arguments(
+      cutlass::gemm::GemmCoord  problem_size,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum,
+      int64_t                   batch_stride_Max,
+      int64_t                   batch_stride_Sum
+    ):
+      problem_size(problem_size),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      batch_stride_Max(batch_stride_Max),
+      batch_stride_Sum(batch_stride_Sum),
+      problem_sizes(nullptr),
+      offset_Norm_Device(nullptr),
+      offset_Sum_Device(nullptr)
+    {
+    }
+    // Grouped constructor
+    Arguments(
+      cutlass::gemm::GemmCoord  *problem_sizes,
+      ElementNorm*              block_Norm,
+      ElementSum*               block_Sum,
+      int64_t*                  offset_Norm_Device,
+      int64_t*                  offset_Sum_Device
+    ):
+      problem_sizes(problem_sizes),
+      problem_size(cutlass::gemm::GemmCoord(0, 0, 0)),
+      block_Norm(block_Norm),
+      block_Sum(block_Sum),
+      offset_Norm_Device(offset_Norm_Device),
+      offset_Sum_Device(offset_Sum_Device)
+    {
+    }
+  };
+  struct SharedStorage {
+  };
+  //
+  // Params struct
+  //
+  struct Params {
+    Arguments args;
+    //
+    // Methods
+    //
+    Params() { }
+    Params(Arguments const &args_): args(args_) { }
+  };
+private:
+public:
+  CUTLASS_DEVICE
+  ApplySoftmaxFinalReduction() { }
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    apply(params, shared_storage);
+  }
+private:
+  /// Full reduction
+  CUTLASS_DEVICE
+  void apply(Params const &params, SharedStorage &shared_storage) {
+    int tid = threadIdx.x;
+    int bid = blockIdx.x;
+    int bdim = blockDim.x;
+    int block_batch = blockIdx.z;
+    // defining three vars for a general reduction module
+    cutlass::gemm::GemmCoord problem_size = isGroupedProblem ? params.args.problem_sizes[bid] : params.args.problem_size;
+    int m_dim_in_loop = isGroupedProblem ? problem_size.m() : tid + bdim;
+    int access_offset = isGroupedProblem ? 0 : bid * bdim;
+    if (!isGroupedProblem && access_offset + tid >= problem_size.m()) return;
+    ElementNorm *curr_ptr_Max = isGroupedProblem ? \
+              params.args.block_Norm + params.args.offset_Norm_Device[bid] : \
+              params.args.block_Norm + block_batch * params.args.batch_stride_Max;
+    ElementSum *curr_ptr_Sum = isGroupedProblem ? \
+              params.args.block_Sum + params.args.offset_Sum_Device[bid] : \
+              params.args.block_Sum + block_batch * params.args.batch_stride_Sum;
+    int threadblock_num = (problem_size.n() + ThreadblockShape::kN - 1) / ThreadblockShape::kN;
+    using ConvertSumOutput = cutlass::NumericConverter<ElementSum, ElementSoftmaxCompute>;
+    using ConvertNormOutput = cutlass::NumericConverter<ElementNorm, ElementSoftmaxCompute>;
+    using ConvertSum = cutlass::NumericConverter<ElementSoftmaxCompute, ElementSum>;
+    using ConvertNorm = cutlass::NumericConverter<ElementSoftmaxCompute, ElementNorm>;
+    ConvertSum   convert_sum;
+    ConvertNorm  convert_norm;
+    ConvertSumOutput   convert_sum_output;
+    ConvertNormOutput  convert_norm_output;
+    uint32_t float_max_bits = 0xff7fffff;
+    float min_float = reinterpret_cast<float const &>(float_max_bits);
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx_m = tid; idx_m < m_dim_in_loop; idx_m += bdim) {
+      ElementNorm *access_n = curr_ptr_Max + idx_m + access_offset;
+      ElementSum *access_s = curr_ptr_Sum + idx_m + access_offset;
+      ElementNorm *access_n_bak = access_n;
+      ElementSum *access_s_bak = access_s;
+      ElementSoftmaxCompute max_val = ElementSoftmaxCompute(min_float);
+      ElementSoftmaxCompute sum_val = ElementSoftmaxCompute(0);
+      ElementNorm fetch_n;
+      ElementSum fetch_s;
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
+        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
+        max_val = cutlass::fast_max(max_val, convert_norm(fetch_n));
+        access_n += problem_size.m();
+      }
+      access_n = access_n_bak;
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx_n = 0; idx_n < threadblock_num; idx_n++) {
+        cutlass::arch::global_load<ElementNorm, sizeof(ElementNorm)>(fetch_n, access_n, true);
+        cutlass::arch::global_load<ElementSum, sizeof(ElementSum)>(fetch_s, access_s, true);
+        sum_val += convert_sum(fetch_s) * cutlass::fast_exp(convert_norm(fetch_n) - max_val);
+        access_n += problem_size.m();
+        access_s += problem_size.m();
+      }
+      ElementSoftmaxCompute inv_sum = cutlass::constants::one<ElementSoftmaxCompute>() / sum_val;
+      access_n = access_n_bak;
+      access_s = access_s_bak;
+      access_n[0] = convert_norm_output(max_val);
+      access_s[0] = convert_sum_output(inv_sum);
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/reduce_split_k.h ADDED Viewed

	@@ -0,0 +1,248 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over densely packed tensors in global memory
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/layout/matrix.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,              ///< shape of CTA        (concept: MatrixShape)
+  typename OutputOp_ ,          ///< output operator     (concept: epilogue::thread operator)
+  typename ReductionOp_,        ///< reduction operator  (concept: ReductionOperator)
+  int PartitionsPerStage = 4    ///< number of partitions to issue
+>
+class ReduceSplitK {
+public:
+  using Shape = Shape_;
+  using ReductionOp = ReductionOp_;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = OutputOp::kCount;
+  static int const kPartitionsPerStage = PartitionsPerStage;
+  using ElementWorkspace = typename ReductionOp::Element;
+  using ElementAccumulator = typename ReductionOp::ElementAccumulator;
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor>;
+  using OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor>;
+  using StrideIndex = typename WorkspaceTensorRef::Layout::Stride::Index;
+  using FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess>;
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess>;
+  //
+  // Types
+  //
+  /// Params structure
+  struct Params {
+    MatrixCoord problem_size;
+    int partitions;
+    size_t partition_stride;
+    WorkspaceTensorRef workspace;
+    OutputTensorRef destination;
+    OutputTensorRef source;
+    typename OutputOp::Params output;
+    typename ReductionOp::Params reduction;
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    CUTLASS_HOST_DEVICE
+    Params(
+      MatrixCoord problem_size_,
+      int partitions_,
+      size_t partition_stride_,
+      WorkspaceTensorRef workspace_,
+      OutputTensorRef destination_,
+      OutputTensorRef source_,
+      typename OutputOp::Params output_ = typename OutputOp::Params(),
+      typename ReductionOp::Params reduction_ = typename ReductionOp::Params()
+    ):
+      problem_size(problem_size_),
+      partitions(partitions_),
+      partition_stride(sizeof(FragmentWorkspace) * partition_stride_ / kElementsPerAccess),
+      workspace(workspace_),
+      destination(destination_),
+      source(source_),
+      output(output_),
+      reduction(reduction_) {
+    }
+  };
+  struct SharedStorage { };
+public:
+  /// Computes the grid size given a chosen threadblock shape
+  CUTLASS_HOST_DEVICE
+  static dim3 grid_shape(
+    cutlass::MatrixCoord problem_size) {
+    return dim3(
+      (problem_size.row() + Shape::kRow - 1) / Shape::kRow,
+      (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn);
+  }
+  /// Determines the threadblock shape
+  CUTLASS_HOST_DEVICE
+  static dim3 block_shape() {
+    return dim3(Shape::kColumn / kElementsPerAccess, Shape::kRow);
+  }
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &storage) {
+    // Determine CTA position
+    MatrixCoord thread_offset(
+      MatrixCoord::Index(int(blockIdx.x) * Shape::kRow + threadIdx.y),
+      MatrixCoord::Index(int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess)
+    );
+    // One guard conditional
+    if (!(thread_offset.row() < params.problem_size.row() &&
+          thread_offset.column() < params.problem_size.column())) {
+      return;
+    }
+    ReductionOp reduction_op(params.reduction);
+    FragmentAccumulator accumulator;
+    accumulator.clear();
+    //
+    // Load the first slice
+    //
+    char const *workspace_ptr =
+      reinterpret_cast<char const *>(
+        params.workspace.data() + params.workspace.offset(thread_offset));
+    FragmentWorkspace workspace_frag[kPartitionsPerStage];
+    //
+    // Construct the output operator
+    //
+    OutputOp output_op(params.output);
+    //
+    // Load and accumulate with a simple batched loading sequence.
+    //
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int k = 0; k < params.partitions; k += kPartitionsPerStage) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kPartitionsPerStage; ++i) {
+        if (k + i < params.partitions) {
+          workspace_frag[i] = *reinterpret_cast<FragmentWorkspace const *>(workspace_ptr);
+          workspace_ptr += params.partition_stride;
+        }
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kPartitionsPerStage; ++i) {
+        if (k + i < params.partitions) {
+          accumulator = reduction_op(accumulator, workspace_frag[i]);
+        }
+      }
+    }
+    //
+    // Conditionally load the source
+    //
+    FragmentOutput source_frag;
+    source_frag.clear();
+    FragmentOutput const *source_ptr = reinterpret_cast<FragmentOutput const *>(
+      params.source.data() + params.source.offset(thread_offset));
+    if (output_op.is_source_needed()) {
+      reinterpret_cast<FragmentOutput &>(source_frag) = *source_ptr;
+    }
+    //
+    // Compute the output
+    //
+    typename OutputOp::FragmentOutput output_frag = output_op(accumulator, source_frag);
+    //
+    // Store
+    //
+    FragmentOutput *dest_ptr = reinterpret_cast<FragmentOutput *>(
+      params.destination.data() + params.destination.offset(thread_offset));
+    *dest_ptr = reinterpret_cast<FragmentOutput const &>(output_frag);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h ADDED Viewed

	@@ -0,0 +1,606 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace kernel {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Parameters structure
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (i.e. number of outer ranks)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineContiguousParams {
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  Coord<kRank> extent;                          /// Extent of source tensor
+  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank
+  int64_t dst_stride[kReducedRank];             /// stride (units of bytes) - I, J
+  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
+  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
+  int workspace_count;                          /// number of workspaces
+  uint64_t inner_count;                          /// Number of elements in reduced index space
+  uint64_t outer_count;                          /// Number of elements in outer index space
+  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
+  ElementSource const * source;                 /// Pointer to source pointer of rank kRank
+  ReductionOp reduction_op;                     /// Reduction operator
+  ElementCompute reduction_identity;            /// Identity element used by reduction operator
+  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
+  //
+  // Methods
+  //
+  /// Ctor
+  CUTLASS_HOST_DEVICE
+  TensorReductionAffineContiguousParams() {
+  }
+  /// Ctor
+  TensorReductionAffineContiguousParams(
+    Coord<kRank> extent_,                       ///< Extent of source tensor
+    ElementOutput * dst_ptr_,                   ///< Output tensor data
+    int64_t dst_stride_[],                      ///< Stride (units of elements)
+    ElementSource const * src_ptr_,             ///< Source tensor data
+    int64_t src_stride_[],                      ///< Stride (units of elements)
+    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
+    int64_t workspace_stride_,                  ///< Stride between workspaces
+    int workspace_count_,                       ///< Number of workspaces
+    ReductionOp reduction_op_,                  ///< Reduction operator
+    ElementCompute reduction_identity_ = ElementCompute() ///< Identity element used by reduction operator
+  ):
+    extent(extent_),
+    inner_count(1),
+    outer_count(1),
+    destination(dst_ptr_),
+    source(src_ptr_),
+    device_workspace(device_workspace_),
+    workspace_stride(workspace_stride_),
+    workspace_count(workspace_count_),
+    reduction_op(reduction_op_),
+    reduction_identity(reduction_identity_) {
+    // Initialize divisors for fast div-mod
+    for (int p = 1; p < kRank; ++p) {
+      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
+    }
+    int input_size_bits = sizeof_bits<ElementSource>::value;
+    int output_size_bits = sizeof_bits<ElementOutput>::value;
+    // Compute strides in units of bytes
+    for (int p = 0; p < kReducedRank; ++p) {
+      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
+    }
+    for (int p = 0; p < kRank - 1; ++p) {
+      src_stride[p] = src_stride_[p] * input_size_bits / 8;
+    }
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank; ++p) {
+      outer_count *= uint64_t(extent[p]);
+    }
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= uint64_t(extent[kRank - 1 - p]);
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Kernel to reduce a tensor with affine layout over a set of ranks *INCLUDING* the contiguous
+/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineContiguous {
+public:
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
+  /// Shared memory allocation used for reduction within the CTA
+  struct SharedStorage {
+    Array<ElementCompute, kThreads * kVectorLength> workspace;
+  };
+  /// Parameters structure
+  using Params = TensorReductionAffineContiguousParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+private:
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_inner_coord_and_offset_(
+    Params const &params,
+    Coord<kInnerRank> & coord,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+    // Decompose into a coordinate of rank <kInnerRank>
+    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kRank - kInnerRank]);
+    // Compute an offset using the souce stride
+    src_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kInnerRank - 1; ++i) {
+      src_offset += coord[i] * params.src_stride[kReducedRank + i];
+    }
+    src_offset += coord[kInnerRank - 1] * sizeof_bits<ElementSource>::value / 8;
+  }
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params,
+    Coord<kReducedRank> & coord,
+    int64_t &dst_offset,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+    // Decompose into coordinate of rank <kReducedRank>
+    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
+    // Compute offsets using destination and source strides
+    dst_offset = 0;
+    src_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+      src_offset += params.src_stride[i] * coord[i];
+    }
+  }
+  /// Reduces over the reduction indices yielding a single element
+  CUTLASS_DEVICE
+  ElementCompute reduce_indices_(
+    Params const &params,
+    ElementCompute *threadblock_workspace,
+    char const *src_byte_ptr,
+    int coord_c) {
+    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
+    ReductionOp reduction_op(params.reduction_op);
+    //
+    // Early exit or initialize to identity element
+    //
+    if (!params.inner_count) {
+      return params.reduction_identity;
+    }
+    ComputeFragment accumulator;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(accumulator.size()); ++i) {
+      accumulator[i] = params.reduction_identity;
+    }
+    // Compute the coordinate of the first access
+    int64_t src_byte_offset = 0;
+    Coord<kInnerRank> coord;
+    uint64_t linear_idx = (threadIdx.x + blockDim.x * threadIdx.z + blockDim.x * blockIdx.z * blockDim.z) * kVectorLength;
+    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+    // Load the first vector
+    SourceFragment source_fragment[kBatchSize];
+    bool not_done = true;
+    // Iterate over vectors in a linearized reduction index space
+    while (not_done) {
+      bool guards[kBatchSize];
+      // Issue a batch of loads
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (linear_idx < params.inner_count) {
+          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
+          guards[b] = true;
+        }
+        else {
+          guards[b] = false;
+          not_done = false;
+        }
+        linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
+        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+      }
+      // Perform a batch of reduction operations
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (guards[b]) {
+          auto cvt = convert_source(source_fragment[b]);
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op,
+            accumulator,
+            cvt);
+        }
+      }
+    };
+    //
+    // Reduction of vectors to scalar
+    //
+    ElementCompute reduced_accumulator = accumulator[0];
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < kVectorLength; ++i) {
+      reduced_accumulator = reduction_op(reduced_accumulator, accumulator[i]);
+    }
+    //
+    // Reduction within CTA across threadIdx.xz => threadIdx{.x = 0, .z = 0}
+    //
+    // This re-arranges data so threadIdx.y is effectively a row index and threadIdx.xz is a column
+    //
+    int thread_count = blockDim.x * blockDim.z;
+    int thread_j = threadIdx.x + blockDim.x * threadIdx.z;
+    int thread_i = threadIdx.y;
+    ElementCompute *frag_ptr = reinterpret_cast<ElementCompute *>(threadblock_workspace) + thread_i * thread_count;
+    frag_ptr[thread_j] = reduced_accumulator;
+    //
+    // Reduce
+    //
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (thread_count > 1) {
+      thread_count /= 2;
+      __syncthreads();
+      if (thread_j < thread_count) {
+        ElementCompute other = frag_ptr[thread_j + thread_count];
+        reduced_accumulator = reduction_op(reduced_accumulator, other);
+        frag_ptr[thread_j] = reduced_accumulator;
+      }
+      __syncthreads();
+    }
+    return reduced_accumulator;
+  }
+public:
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
+    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source);
+    char * dst_byte_ptr = nullptr;
+    // If performing a reduction across CTAs, redirect output to device workspace
+    if (gridDim.z == 1) {
+      dst_byte_ptr = reinterpret_cast<char *>(params.destination);
+    }
+    else {
+      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace);
+    }
+    uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
+    // Use modulo division to compute location
+    Coord<kReducedRank> outer_coord;
+    int64_t dst_byte_offset;
+    int64_t src_byte_offset;
+    compute_outer_coord_and_offset_(
+      params,
+      outer_coord,
+      dst_byte_offset,
+      src_byte_offset,
+      idx_linear);
+    if (gridDim.z == 1) {
+      /// Complete the reduction with no workspace
+      while (idx_linear < params.outer_count) {
+        ElementCompute result = reduce_indices_(
+          params,
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset,
+          coord_c);
+        // Store the result after possible final reduction within the CTA
+        if (threadIdx.z == 0 && threadIdx.x == 0) {
+          // Convert to output type and store
+          NumericConverter<ElementOutput, ElementCompute> convert_output;
+          ElementOutput cvt = convert_output(result);
+          *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = cvt;
+        }
+        __syncthreads();
+        // Update indices and pointers
+        idx_linear += gridDim.y * blockDim.y;
+        compute_outer_coord_and_offset_(
+          params,
+          outer_coord,
+          dst_byte_offset,
+          src_byte_offset,
+          idx_linear);
+      } // while
+    }
+    else {
+      /// Complete the reduction with workspace
+      while (idx_linear < params.outer_count) {
+        ElementCompute result = reduce_indices_(
+          params,
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset,
+          coord_c);
+        int64_t byte_offset =
+          blockIdx.z * params.workspace_stride + idx_linear * sizeof_bits<ElementCompute>::value / 8;
+        // Store the result for final reduction
+        if (threadIdx.z == 0 && threadIdx.x == 0) {
+          *reinterpret_cast<ElementCompute *>(dst_byte_ptr + byte_offset) = result;
+        }
+        __syncthreads();
+        // Update indices and pointers
+        idx_linear += gridDim.y * blockDim.y;
+        compute_outer_coord_and_offset_(
+          params,
+          outer_coord,
+          dst_byte_offset,
+          src_byte_offset,
+          idx_linear);
+      } // while
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Kernel to perform final reduction
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineContiguousFinal {
+public:
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  /// Shared memory
+  struct SharedStorage { };
+  /// Parameters structure
+  using Params = TensorReductionAffineContiguousParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+private:
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params,
+    Coord<kReducedRank> & coord,
+    int64_t &dst_offset,
+    uint64_t linear_idx) const {
+    // Decompose into coordinate of rank <kReducedRank>
+    coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
+    // Compute offsets using destination and source strides
+    dst_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+    }
+  }
+  /// Reduces over the reduction indices
+  CUTLASS_DEVICE
+  ElementCompute reduce_indices_(
+    Params const &params,
+    ElementCompute const *device_workspace) {
+    ReductionOp reduction_op(params.reduction_op);
+    char const *src_byte_ptr = reinterpret_cast<char const *>(device_workspace);
+    // Accumulated output
+    ElementCompute accumulator = params.reduction_identity;
+    for (int iter = 0; iter < params.workspace_count; ++iter) {
+      ElementCompute workspace_item = *reinterpret_cast<ElementCompute const *>(src_byte_ptr);
+      accumulator = reduction_op(accumulator, workspace_item);
+      src_byte_ptr += params.workspace_stride;
+    }
+    return accumulator;
+  }
+public:
+  //
+  // Methods
+  //
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    uint64_t idx_linear = blockIdx.x * blockDim.x + threadIdx.x;
+    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination);
+    // Use modulo division to compute location
+    Coord<kReducedRank> outer_coord;
+    int64_t dst_byte_offset;
+    compute_outer_coord_and_offset_(
+      params,
+      outer_coord,
+      dst_byte_offset,
+      idx_linear);
+    /// Complete the reduction
+    while (idx_linear < params.outer_count) {
+      ElementCompute result = reduce_indices_(params, params.device_workspace + idx_linear);
+      // Convert to output type and store
+      NumericConverter<ElementOutput, ElementCompute> convert_output;
+      *reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = convert_output(result);
+      // Update indices and pointers
+      idx_linear += gridDim.x * blockDim.x;
+      compute_outer_coord_and_offset_(
+        params,
+        outer_coord,
+        dst_byte_offset,
+        idx_linear);
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h ADDED Viewed

	@@ -0,0 +1,641 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over one or more ranks of an affine tensor
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/reduction/thread/reduction_operators.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace kernel {
+/// Parameters structure
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+struct TensorReductionAffineStridedParams {
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  Coord<kRank> extent;                          /// Extent of source tensor
+  FastDivmodU64 divmod[kRank - 1];              /// FastDivmod by each strided rank
+  int64_t dst_stride[kReducedRank - 1];         /// stride (units of bytes) - I, J
+  int64_t src_stride[kRank - 1];                /// stride (units of bytes) - I, J, K
+  int64_t workspace_stride;                     /// stride (units of bytes) between workspace
+  int64_t workspace_outer_stride;               /// stride (units of bytes) between 'rows' of the workspace
+  int workspace_count;                          /// number of workspaces
+  uint64_t inner_count;                          /// Number of elements in reduced index space
+  uint64_t outer_count;                          /// Number of elements in outer index space
+  ElementOutput * destination;                  /// Pointer to output tensor of rank kReducedRank
+  ElementSource const * source;                 /// Pointer to source pointer of rank kRank
+  ReductionOp reduction_op;                     /// Reduction operator
+  ElementCompute reduction_identity;            /// Identity element for reduction operator
+  ElementCompute *device_workspace;             /// Pointer to device workspace for inter-CTA reductions
+  //
+  // Methods
+  //
+  /// Ctor
+  CUTLASS_HOST_DEVICE
+  TensorReductionAffineStridedParams() {
+  }
+  /// Ctor
+  TensorReductionAffineStridedParams(
+    Coord<kRank> extent_,                       ///< Extent of source tensor
+    ElementOutput * dst_ptr_,                   ///< Output tensor data
+    int64_t dst_stride_[],                      ///< Stride (units of elements)
+    ElementSource const * src_ptr_,             ///< Source tensor data
+    int64_t src_stride_[],                      ///< Stride (units of elements)
+    ElementCompute *device_workspace_,          ///< Pointer to device workspace for inter-CTA reductions
+    int64_t workspace_stride_,                  ///< Stride between workspaces
+    int workspace_count_,                       ///< Number of workspaces
+    ReductionOp reduction_op_,                  ///< Reduction operator
+    ElementCompute reduction_identity_  = ElementCompute() ///< Identity element for reduction operator
+  ):
+    extent(extent_),
+    inner_count(1),
+    outer_count(1),
+    destination(dst_ptr_),
+    source(src_ptr_),
+    device_workspace(device_workspace_),
+    workspace_outer_stride(0),
+    workspace_stride(workspace_stride_),
+    workspace_count(workspace_count_),
+    reduction_op(reduction_op_),
+    reduction_identity(reduction_identity_) {
+    // Initialize divisors for fast div-mod
+    for (int p = 1; p < kRank; ++p) {
+      divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
+    }
+    int input_size_bits = sizeof_bits<ElementSource>::value;
+    int output_size_bits = sizeof_bits<ElementOutput>::value;
+    workspace_outer_stride = workspace_stride * workspace_count;
+    // Compute strides in units of bytes
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
+    }
+    for (int p = 0; p < kRank - 1; ++p) {
+      src_stride[p] = src_stride_[p] * input_size_bits / 8;
+    }
+    // Compute number of elements in strided ranks
+    for (int p = 0; p < kReducedRank - 1; ++p) {
+      outer_count *= uint64_t(extent[p]);
+    }
+    for (int p = 0; p < kInnerRank; ++p) {
+      inner_count *= uint64_t(extent[kReducedRank + p - 1]);
+    }
+  }
+};
+/// Kernel to reduce a tensor with affine layout over a set of ranks *EXCLUDING* the contiguous
+/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineStrided {
+public:
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
+  /// Shared memory allocation used for reduction within the CTA
+  struct SharedStorage {
+    Array<ElementCompute, kThreads * kVectorLength> workspace;
+  };
+  /// Parameters structure
+  using Params = TensorReductionAffineStridedParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+private:
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_inner_coord_and_offset_(
+    Params const &params,
+    Coord<kInnerRank> & coord,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+    // Decompose into coordinate
+    coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kReducedRank - 1]);
+    // Compute linear offset
+    src_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kInnerRank; ++i) {
+      src_offset += params.src_stride[kReducedRank + i - 1] * coord[i];
+    }
+  }
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params,
+    Coord<kReducedRank - 1> & coord,
+    int64_t &dst_offset,
+    int64_t &src_offset,
+    uint64_t linear_idx) const {
+    // Decompose linear coordinate
+    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
+    // Compute offset into tensors
+    dst_offset = 0;
+    src_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank - 1; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+      src_offset += params.src_stride[i] * coord[i];
+    }
+  }
+  /// Reduces over the reduction indices
+  CUTLASS_DEVICE
+  ComputeFragment reduce_indices_(
+    Params const &params,
+    ElementCompute *threadblock_workspace,
+    char const *src_byte_ptr) {
+    NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
+    ReductionOp reduction_op(params.reduction_op);
+    // Accumulated output
+    ComputeFragment identity_frag;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(identity_frag.size()); ++i) {
+      identity_frag[i] = params.reduction_identity;
+    }
+    if (!params.inner_count) {
+      return identity_frag;
+    }
+    ComputeFragment accumulator = identity_frag;
+    // Compute the coordinate of the first access
+    int64_t src_byte_offset = 0;
+    Coord<kInnerRank> coord;
+    uint64_t linear_idx = threadIdx.z + blockIdx.z * blockDim.z;
+    compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+    // Load the first vector
+    SourceFragment source_fragment[kBatchSize];
+    bool not_done = true;
+    // Iterate over vectors in a linearized reduction index space
+    while (not_done) {
+      bool guards[kBatchSize];
+      // Issue a batch of loads
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (linear_idx < params.inner_count) {
+          source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
+          guards[b] = true;
+        }
+        else {
+          guards[b] = false;
+          not_done = false;
+        }
+        linear_idx += blockDim.z * gridDim.z;
+        compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
+      }
+      // Perform a batch of reduction operations
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (guards[b]) {
+          auto cvt = convert_source(source_fragment[b]);
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op,
+             accumulator,
+             cvt);
+        }
+      }
+    };
+    // Optional reduction within a CTA
+    if (blockDim.z > 1) {
+      // Linearized thread ID
+      int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+      // all threads store to workspace
+      ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
+      frag_ptr[thread_idx] = accumulator;
+      __syncthreads();
+      if (threadIdx.z == 0) {
+        // Load all additional block indices
+        for (int z = 1; z < blockDim.z; ++z) {
+          ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
+          accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
+            reduction_op,
+            accumulator,
+            frag);
+        }
+      }
+      __syncthreads();
+    }
+    return accumulator;
+  }
+public:
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
+    char const * src_byte_ptr = reinterpret_cast<char const *>(params.source + coord_c);
+    char * dst_byte_ptr = nullptr;
+    // If performing a reduction across CTAs, redirect output to device workspace
+    if (gridDim.z == 1) {
+      dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
+    }
+    else {
+      dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
+    }
+    // If the C index is out of bounds, exit
+    if (coord_c >= params.extent[kRank - 1]) {
+      return;
+    }
+    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
+    // Use modulo division to compute location
+    Coord<kReducedRank - 1> outer_coord;
+    int64_t dst_byte_offset;
+    int64_t src_byte_offset;
+    compute_outer_coord_and_offset_(
+      params,
+      outer_coord,
+      dst_byte_offset,
+      src_byte_offset,
+      idx_linear);
+    if (gridDim.z == 1) {
+      /// Complete the reduction with no workspace
+      while (idx_linear < params.outer_count) {
+        ComputeFragment result;
+        result = reduce_indices_(
+          params,
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset);
+        // Store the result after possible final reduction within the CTA
+        if (threadIdx.z == 0) {
+          // Convert to output type and store
+          NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
+          auto cvt = convert_output(result);
+          *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
+            reinterpret_cast<OutputFragment const &>(cvt);
+        }
+        // Update indices and pointers
+        idx_linear += gridDim.y * blockDim.y;
+        compute_outer_coord_and_offset_(
+          params,
+          outer_coord,
+          dst_byte_offset,
+          src_byte_offset,
+          idx_linear);
+      } // while
+    }
+    else {
+      /// Complete the reduction with a device workspace
+      while (idx_linear < params.outer_count) {
+        ComputeFragment result;
+        result = reduce_indices_(
+          params,
+          shared_storage.workspace.data(),
+          src_byte_ptr + src_byte_offset);
+        // Store the result after possible final reduction within the CTA
+        if (threadIdx.z == 0) {
+          int64_t byte_offset =
+            blockIdx.z * params.workspace_stride + idx_linear * params.workspace_outer_stride;
+          // No conversion - store in compute type
+          *reinterpret_cast<ComputeFragment *>(dst_byte_ptr + byte_offset) =
+            reinterpret_cast<ComputeFragment const &>(result);
+        }
+        // Update indices and pointers
+        idx_linear += gridDim.y * blockDim.y;
+        compute_outer_coord_and_offset_(
+          params,
+          outer_coord,
+          dst_byte_offset,
+          src_byte_offset,
+          idx_linear);
+      } // while (outer index)
+    } // if ()
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Kernel to perform final reduction
+template <
+  int Rank,                                   ///< Rank of source tensor (e.g. NDHWC => 5)
+  int ReducedRank,                            ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
+  typename ElementOutput,                     ///< Data type of output tensor
+  typename ElementSource,                     ///< Data type of source tensor
+  typename ReductionOp,                       ///< Reduction operator
+  int VectorLength  = 1,                      ///< Vector length for memory
+  typename ElementCompute = ElementOutput,    ///< Internal compute type - input type of reduction operation
+  int Threads = 256,                          ///< Number of participating threads
+  int BatchSize = 4                           ///< Number of elements to load per batch
+>
+class TensorReductionAffineStridedFinal {
+public:
+  static int const kRank = Rank;
+  static int const kReducedRank = ReducedRank;
+  static int const kVectorLength = VectorLength;
+  static int const kInnerRank = kRank - kReducedRank;
+  static int const kThreads = Threads;
+  static int const kBatchSize = BatchSize;
+  using ComputeFragment = Array<ElementCompute, VectorLength>;
+  using SourceFragment = AlignedArray<ElementSource, VectorLength>;
+  using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
+  /// Shared memory
+  struct SharedStorage { };
+  /// Parameters structure
+  using Params = TensorReductionAffineStridedParams<
+    Rank,
+    ReducedRank,
+    ElementOutput,
+    ElementSource,
+    ReductionOp,
+    VectorLength,
+    ElementCompute,
+    Threads,
+    BatchSize
+  >;
+private:
+  /// Computes the coordinate and offset of a given linear index
+  CUTLASS_DEVICE
+  void compute_outer_coord_and_offset_(
+    Params const &params,
+    Coord<kReducedRank - 1> & coord,
+    int64_t &dst_offset,
+    uint64_t linear_idx) const {
+    // Decompose linear index
+    coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
+    // Compute tensor offset
+    dst_offset = 0;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kReducedRank - 1; ++i) {
+      dst_offset += params.dst_stride[i] * coord[i];
+    }
+  }
+  /// Reduces over the reduction indices
+  CUTLASS_DEVICE
+  ComputeFragment reduce_indices_(
+    Params const &params,
+    char *src_byte_ptr) {
+    ReductionOp reduction_op(params.reduction_op);
+    // Accumulated output
+    ComputeFragment identity_frag;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(identity_frag.size()); ++i) {
+      identity_frag[i] = params.reduction_identity;
+    }
+    ComputeFragment accumulator = identity_frag;
+    ComputeFragment workspace_fragments[kBatchSize];
+    // Partially unrolled loop
+    for (int idx = 0; idx < params.workspace_count; idx += kBatchSize) {
+      // Issue a batch of loads
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        if (idx + b < params.workspace_count) {
+          workspace_fragments[b] =
+            *reinterpret_cast<ComputeFragment *>(src_byte_ptr);
+        }
+        else {
+          workspace_fragments[b] = identity_frag;
+        }
+        src_byte_ptr += + params.workspace_stride;
+      }
+      // Perform a reduction
+      CUTLASS_PRAGMA_UNROLL
+      for (int b = 0; b < kBatchSize; ++b) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kVectorLength; ++i) {
+          accumulator[i] = reduction_op(accumulator[i], workspace_fragments[b][i]);
+        }
+      }
+    }
+    return accumulator;
+  }
+public:
+  //
+  // Methods
+  //
+  /// Perform a reduction
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+    int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
+    char * src_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
+    char * dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
+    // If the C index is out of bounds, exit
+    if (coord_c >= params.extent[kRank - 1]) {
+      return;
+    }
+    int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
+    // Use modulo division to compute location
+    Coord<kReducedRank - 1> outer_coord;
+    int64_t dst_byte_offset;
+    compute_outer_coord_and_offset_(
+      params,
+      outer_coord,
+      dst_byte_offset,
+      idx_linear);
+    /// Complete the reduction
+    while (idx_linear < params.outer_count) {
+      int64_t src_byte_offset = idx_linear * params.workspace_outer_stride;
+      ComputeFragment result = reduce_indices_(
+        params,
+        src_byte_ptr + src_byte_offset);
+      // Convert to output type and store
+      NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
+      auto cvt = convert_output(result);
+      *reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
+        reinterpret_cast<OutputFragment const &>(cvt);
+      // Update indices and pointers
+      idx_linear += gridDim.y * blockDim.y;
+      compute_outer_coord_and_offset_(
+        params,
+        outer_coord,
+        dst_byte_offset,
+        idx_linear);
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace kernel
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/thread/reduce.h ADDED Viewed

	@@ -0,0 +1,234 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines basic thread level reduction with specializations for Array<T, N>.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+namespace cutlass {
+namespace reduction {
+namespace thread {
+/// Structure to compute the thread level reduction
+template <typename Op, typename T>
+struct Reduce;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial Specialization of Reduce for "plus" (a functional operator)
+template <typename T>
+struct Reduce< plus<T>, T > {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs, T const &rhs) const {
+    plus<T> _op;
+    return _op(lhs, rhs);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization of Reduce for Array<T, N>
+template <typename T, int N>
+struct Reduce < plus<T>, Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, 1> operator()(Array<T, N> const &in) const {
+    Array<T, 1> result;
+    Reduce< plus<T>, T > scalar_reduce;
+    result.clear();
+    CUTLASS_PRAGMA_UNROLL
+    for (auto i = 0; i < N; ++i) {
+      result[0] = scalar_reduce(result[0], in[i]);
+    }
+    return result;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specializations of Reduce for Array<half_t, N>
+template <int N>
+struct Reduce < plus<half_t>, Array<half_t, N> > {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, 1> operator()(Array<half_t, N> const &input) {
+    Array<half_t, 1> result;
+    // If there is only 1 element - there is nothing to reduce
+    if( N ==1 ){
+      result[0] = input.front();
+    } else {
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+        __half result_d;
+        Array<half_t, 1> const *in_ptr_half = reinterpret_cast<Array<half_t, 1> const *>(&input);
+        Array<half_t, 2> const *in_ptr_half2 = reinterpret_cast<Array<half_t, 2> const *>(&input);
+        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
+        // Set initial result = first half2, in case N==2
+        __half2 tmp_result = x_in_half2[0];
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < N/2; ++i) {
+          tmp_result = __hadd2(x_in_half2[i], tmp_result);
+        }
+        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
+        // One final step is needed for odd "N" (to add the (N-1)th element)
+        if( N%2 ){
+          __half last_element;
+          Array<half_t, 1> tmp_last;
+          Array<half_t, 1> *tmp_last_ptr = &tmp_last;
+          tmp_last_ptr[0] = in_ptr_half[N-1];
+          last_element = reinterpret_cast<__half  const &>(tmp_last);
+          result_d = __hadd(result_d, last_element);
+        }
+        Array<half_t, 1> *result_ptr = &result;
+        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
+      #else
+        Reduce< plus<half_t>, half_t > scalar_reduce;
+        result.clear();
+        CUTLASS_PRAGMA_UNROLL
+        for (auto i = 0; i < N; ++i) {
+          result[0] = scalar_reduce(result[0], input[i]);
+        }
+      #endif
+    }
+    return result;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specializations of Reduce for AlignedArray<half_t, N>
+template <int N>
+struct Reduce < plus<half_t>, AlignedArray<half_t, N> > {
+  CUTLASS_HOST_DEVICE
+  Array<half_t, 1> operator()(AlignedArray<half_t, N> const &input) {
+    Array<half_t, 1> result;
+    // If there is only 1 element - there is nothing to reduce
+    if( N ==1 ){
+      result[0] = input.front();
+    } else {
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+        __half result_d;
+        AlignedArray<half_t, 1> const *in_ptr_half = reinterpret_cast<AlignedArray<half_t, 1> const *>(&input);
+        AlignedArray<half_t, 2> const *in_ptr_half2 = reinterpret_cast<AlignedArray<half_t, 2> const *>(&input);
+        __half2 const *x_in_half2 = reinterpret_cast<__half2 const *>(in_ptr_half2);
+        // Set initial result = first half2, in case N==2
+        __half2 tmp_result = x_in_half2[0];
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < N/2; ++i) {
+          tmp_result = __hadd2(x_in_half2[i], tmp_result);
+        }
+        result_d = __hadd(__low2half(tmp_result), __high2half(tmp_result));
+        // One final step is needed for odd "N" (to add the (N-1)th element)
+        if( N%2 ){
+          __half last_element;
+          AlignedArray<half_t, 1> tmp_last;
+          AlignedArray<half_t, 1> *tmp_last_ptr = &tmp_last;
+          tmp_last_ptr[0] = in_ptr_half[N-1];
+          last_element = reinterpret_cast<__half  const &>(tmp_last);
+          result_d = __hadd(result_d, last_element);
+        }
+        Array<half_t, 1> *result_ptr = &result;
+        *result_ptr = reinterpret_cast<Array<half_t, 1> &>(result_d);
+      #else
+        Reduce< plus<half_t>, half_t > scalar_reduce;
+        result.clear();
+        CUTLASS_PRAGMA_UNROLL
+        for (auto i = 0; i < N; ++i) {
+          result[0] = scalar_reduce(result[0], input[i]);
+        }
+      #endif
+    }
+    return result;
+  }
+};
+}
+}
+}

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/thread/reduction_operators.h ADDED Viewed

	@@ -0,0 +1,235 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Kernel performing a reduction over densely packed tensors in global memory
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace reduction {
+namespace thread {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Mixed-precision reduction
+template <
+  typename ElementAccumulator_,
+  typename Element_,
+  int Count = 1
+>
+struct ReduceAdd {
+  //
+  // Type definitions
+  //
+  using ElementAccumulator = ElementAccumulator_;
+  using Element = Element_;
+  static int const kCount = Count;
+  using FragmentAccumulator = cutlass::Array<ElementAccumulator, kCount>;
+  using FragmentElement = cutlass::Array<Element, kCount>;
+  struct Params { };
+  //
+  // Data members
+  //
+  /// Parameters object
+  Params params;
+  //
+  // Methods
+  //
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  ReduceAdd(Params params_ = Params()): params(params_) { }
+  /// Operator
+  CUTLASS_HOST_DEVICE
+  FragmentAccumulator operator()(
+    FragmentAccumulator accumulator,
+    FragmentElement element) const {
+    plus<FragmentAccumulator> op;
+    NumericArrayConverter<
+      ElementAccumulator,
+      Element,
+      kCount,
+      PreferredRoundingMode<ElementAccumulator, Element>::kRound> converter;
+    return op(accumulator, converter(element));
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+/// Special handling for binary operators
+template <typename ReductionOp, typename Element, int N>
+struct VectorizeArrayOperation {
+  using ValueType = Array<Element, N>;
+  CUTLASS_HOST_DEVICE
+  ValueType operator()(
+    ReductionOp const &reduction_op,
+    ValueType const &lhs,
+    ValueType const &rhs) const {
+    ValueType result;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = reduction_op(lhs[i], rhs[i]);
+    }
+    return result;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename ReductionOp, typename Element, int N>
+struct ReduceArrayOperation {
+  using ArrayType = Array<Element, N>;
+  CUTLASS_HOST_DEVICE
+  Element operator()(
+    ReductionOp const &reduction_op,
+    ArrayType const &array) const {
+    Element item = reduction_op(array[0], array[1]);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 2; i < N; ++i) {
+      item = reduction_op(item, array[i]);
+    }
+    return item;
+  }
+};
+template <int N>
+struct ReduceArrayOperation<logical_and<uint1b_t>, uint1b_t, N> {
+  using ArrayType = Array<uint1b_t, N>;
+  CUTLASS_HOST_DEVICE
+  uint1b_t operator()(
+    logical_and<uint1b_t> const &reduction_op,
+    ArrayType const &array) const {
+    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
+    bool item = false;
+    CUTLASS_PRAGMA_UNROLL
+    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
+      uint8_t bits = ptr[byte];
+      item = (item || !bits);
+    }
+    return uint1b_t{!item};
+  }
+};
+template <int N>
+struct ReduceArrayOperation<logical_or<uint1b_t>, uint1b_t, N> {
+  using ArrayType = Array<uint1b_t, N>;
+  CUTLASS_HOST_DEVICE
+  uint1b_t operator()(
+    logical_and<uint1b_t> const &reduction_op,
+    ArrayType const &array) const {
+    uint8_t const *ptr = reinterpret_cast<uint8_t const *>(&array);
+    bool item = true;
+    CUTLASS_PRAGMA_UNROLL
+    for (int byte = 0; byte < (N + 7) / 8; ++byte) {
+      uint8_t bits = ptr[byte];
+      item = (item || bits);
+    }
+    return uint1b_t{item};
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Helper function to infer template argument types
+template <typename ReductionOp, typename Element, int N>
+CUTLASS_HOST_DEVICE
+Array<Element, N> ApplyArrayOperator(
+  ReductionOp const &reduction_op,
+  Array<Element, N> const &lhs,
+  Array<Element, N> const &rhs) {
+  VectorizeArrayOperation<ReductionOp, Element, N> vectorize_op;
+  return vectorize_op(reduction_op, lhs, rhs);
+}
+/// Helper to reduce an array
+template <typename ReductionOp, typename Element, int N>
+Element ReduceArray(ReductionOp const &reduction_op, Array<Element, N> const &array) {
+  ReduceArrayOperation<ReductionOp, Element, N> reduce_array_op;
+  return reduce_array_op(reduction_op, array);
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace detail
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace thread
+} // namespace reduction
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/reduction/threadblock_swizzle.h ADDED Viewed

	@@ -0,0 +1,67 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+**************************************************************************************************/
+/*! \file
+\brief Defies functors for mapping blockIdx to partitions of the batched reduction computation.
+*/
+#pragma once
+#include "cutlass/coord.h"
+namespace cutlass {
+namespace reduction {
+struct DefaultBlockSwizzle {
+  /// Ctor
+  CUTLASS_HOST_DEVICE DefaultBlockSwizzle() {}
+  /// Swizzle the block index.
+  CUTLASS_DEVICE dim3 swizzle() { return blockIdx; }
+  ///
+  CUTLASS_HOST_DEVICE dim3 get_grid_layout(Coord<3> const &problem_size,
+                                           Coord<3> const &OutputTile) {
+    assert(OutputTile[0] == 1 && OutputTile[1] == 1);
+    assert((problem_size[0] * problem_size[1] * problem_size[2]) % OutputTile[2] == 0);
+    dim3 grid;
+    grid.x = problem_size[0] * problem_size[1] * problem_size[2]
+      / OutputTile[2] ;
+    return grid;
+  }
+  ///
+  CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &SubTile) {
+    assert(SubTile[0] == 1 && SubTile[1] == 1);
+    dim3 block = swizzle();
+    Coord<3> threadblock_offset =
+      make_Coord(0, 0, block.x * SubTile[2]);
+    return threadblock_offset;
+  }
+};
+} // namespace reduction
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/relatively_equal.h ADDED Viewed

	@@ -0,0 +1,305 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+  \brief Performs comparison between two elements with support for floating-point comparisons.
+*/
+#pragma once
+#include "numeric_types.h"
+#include "complex.h"
+namespace cutlass {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename T, typename U = T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal(T a, T b, U epsilon, U nonzero_floor);
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+// This floating-point comparison function implements the method described in
+//
+// https://floating-point-gui.de/errors/comparison/
+//
+template <typename T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal_float(T a, T b, T epsilon, T nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+  T abs_A = abs(a);
+  T abs_B = abs(b);
+  T diff = abs(a - b);
+  T zero = T(0);
+  if (a == b) {
+    return true;
+  }
+  else if (a == zero || b == zero || (abs_A + abs_B) < nonzero_floor) {
+    return diff < epsilon * nonzero_floor;
+  }
+  return diff < epsilon * (abs_A + abs_B);
+}
+} // namespace detail
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<bool>(bool a, bool b, bool, bool) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint1b_t>(uint1b_t a, uint1b_t b, uint1b_t, uint1b_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int2b_t>(int2b_t a, int2b_t b, int2b_t, int2b_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint2b_t>(uint2b_t a, uint2b_t b, uint2b_t, uint2b_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int4b_t>(int4b_t a, int4b_t b, int4b_t, int4b_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint4b_t>(uint4b_t a, uint4b_t b, uint4b_t, uint4b_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int8_t>(int8_t a, int8_t b, int8_t, int8_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint8_t>(uint8_t a, uint8_t b, uint8_t, uint8_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int16_t>(int16_t a, int16_t b, int16_t, int16_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint16_t>(uint16_t a, uint16_t b, uint16_t, uint16_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int32_t>(int32_t a, int32_t b, int32_t, int32_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint32_t>(uint32_t a, uint32_t b, uint32_t, uint32_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<int64_t>(int64_t a, int64_t b, int64_t, int64_t) {
+  return (a == b);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<uint64_t>(uint64_t a, uint64_t b, uint64_t, uint64_t) {
+  return (a == b);
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e4m3_t>(float_e4m3_t a, float_e4m3_t b, float_e4m3_t epsilon, float_e4m3_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e5m2_t>(float_e5m2_t a, float_e5m2_t b, float_e5m2_t epsilon, float_e5m2_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<half_t>(half_t a, half_t b, half_t epsilon, half_t nonzero_floor) {
+  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<bfloat16_t>(
+  bfloat16_t a,
+  bfloat16_t b,
+  bfloat16_t epsilon,
+  bfloat16_t nonzero_floor) {
+  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<tfloat32_t>(
+  tfloat32_t a,
+  tfloat32_t b,
+  tfloat32_t epsilon,
+  tfloat32_t nonzero_floor) {
+  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float>(float a, float b, float epsilon, float nonzero_floor) {
+  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<double>(double a, double b, double epsilon, double nonzero_floor) {
+  return detail::relatively_equal_float(a, b, epsilon, nonzero_floor);
+}
+template<typename T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal(complex<T> a, complex<T> b, T epsilon, T nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+  T abs_A = abs(a);
+  T abs_B = abs(b);
+  T diff = abs(a - b);
+  complex<T> zero = complex<T>{T{}, T{}};
+  if (a == b) {
+    return true;
+  }
+  else if (a == zero || b == zero || diff < nonzero_floor) {
+    return diff < epsilon * nonzero_floor;
+  }
+  return diff < epsilon * (abs_A + abs_B);
+}
+template <typename T>
+CUTLASS_HOST_DEVICE
+bool relatively_equal(complex<T> a,  complex<T> b, complex<T> epsilon, complex<T> nonzero_floor) {
+#if defined(__CUDACC_RTC__)
+  using cuda::std::abs;
+#else
+  using std::abs;
+#endif
+  T abs_A = abs(a);
+  T abs_B = abs(b);
+  complex<T> diff = a - b;
+  T abs_diff = abs(diff);
+  complex<T> zero = complex<T>{T{}, T{}};
+  if (a == b) {
+    return true;
+  }
+  else if (a == zero || b == zero || abs_diff < abs(nonzero_floor)) {
+    return abs_diff < abs(epsilon * nonzero_floor);
+  }
+  return abs_diff < abs(epsilon) * (abs_A + abs_B);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e2m3_t>(float_e2m3_t a, float_e2m3_t b, float_e2m3_t epsilon, float_e2m3_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e3m2_t>(float_e3m2_t a, float_e3m2_t b, float_e3m2_t epsilon, float_e3m2_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_e2m1_t>(float_e2m1_t a, float_e2m1_t b, float_e2m1_t epsilon, float_e2m1_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_ue8m0_t>(float_ue8m0_t a, float_ue8m0_t b, float_ue8m0_t epsilon, float_ue8m0_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+template <>
+CUTLASS_HOST_DEVICE
+bool relatively_equal<float_ue4m3_t>(float_ue4m3_t a, float_ue4m3_t b, float_ue4m3_t epsilon, float_ue4m3_t nonzero_floor) {
+  return detail::relatively_equal_float<float>(a, b, epsilon, nonzero_floor);
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/semaphore.h ADDED Viewed

	@@ -0,0 +1,118 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Implementation of a CTA-wide semaphore for inter-CTA synchronization.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/gemm/gemm.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// CTA-wide semaphore for inter-CTA synchronization.
+class Semaphore {
+public:
+  int *lock;
+  bool wait_thread;
+  int state;
+public:
+  /// Implements a semaphore to wait for a flag to reach a given value
+  CUTLASS_HOST_DEVICE
+  Semaphore(int *lock_, int thread_id):
+    lock(lock_),
+    wait_thread(thread_id < 0 || thread_id == 0),
+    state(-1) {
+  }
+  /// Permit fetching the synchronization mechanism early
+  CUTLASS_DEVICE
+  void fetch() {
+    if (wait_thread) {
+      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+      asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+      #else
+      asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+      #endif
+    }
+  }
+  /// Gets the internal state
+  CUTLASS_DEVICE
+  int get_state() const {
+    return state;
+  }
+  /// Waits until the semaphore is equal to the given value
+  CUTLASS_DEVICE
+  void wait(int status = 0) {
+    while( __syncthreads_and(state != status) ) {
+      fetch();
+    }
+    __syncthreads();
+  }
+  /// Updates the lock with the given result
+  CUTLASS_DEVICE
+  void release(int status = 0) {
+    __syncthreads();
+    if (wait_thread) {
+      #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+      asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
+      #else
+      asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
+      #endif
+    }
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/subbyte_reference.h ADDED Viewed

	@@ -0,0 +1,1388 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Provides a mechanism for packing and unpacking elements smaller than one byte
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/integer_subbyte.h"
+#include "cutlass/fast_math.h"
+namespace cutlass {
+namespace detail {
+// This is an implementation detail of cutlass::SubbyteReference and.
+// cutlass::HostTensor.  For a given logical element type Element,
+// and its corresponding storage (physical) element type StorageUnit,
+// it computes quantities that help with managing allocations.
+//
+// CUTLASS uses a hidden "ContainerUnitType" or StorageUnit type to support
+// packed arrays of subbyte types such as int4.  Element is the "logical" type
+// for computations, while CUTLASS uses StorageUnit as the element type
+// of a packed array of Element.  If Element is not a subbyte type,
+// then the corresponding StorageUnit type is just Element itself.
+//
+// The ContainerType is always calculated as an array StorageUnit type (the StorageUnit
+// is always a byte for subbyte types),
+// and its number of bits is the lcm of the subbyte type's number of bits and 8.
+// Below are some examples for different subbyte types.
+//
+// * Subbyte Type=int2, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
+// * Subbyte Type=int4, ContainerType=StorageUnit[1] (StorageUnit=uint8_t)
+template<class Element, class StorageUnit>
+struct StorageContainerCalculator {
+  // kContainerTypeNumBits: The number of bits needed for ContainerType
+  static constexpr int kContainerTypeNumBits   = (sizeof_bits<Element>::value < 8) ? cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value) : sizeof_bits<Element>::value;
+  static_assert(kContainerTypeNumBits % sizeof_bits<Element>::value == 0, "The bits of ContainerType should be divisible by the element's number of bits");
+  // kContainerTypeNumLogicalElements: The number of logical Element instance(s) that can be stored per ContainerType instance
+  static constexpr int kContainerTypeNumLogicalElements = kContainerTypeNumBits / sizeof_bits<Element>::value;
+  /// 3. kContainerTypeNumBytes: The number of bytes per ContainerType instance
+  static constexpr int kContainerTypeNumBytes = kContainerTypeNumBits / 8;
+  /// 4. kContainerTypeNumBytes: The number of base StorageUnit in the ContainerType
+  static constexpr int kContainerTypeNumStorageUnit = kContainerTypeNumBits / sizeof_bits<StorageUnit>::value;
+  static_assert(kContainerTypeNumBits != 0, "kContainerTypeNumBits can not be zero");
+  static_assert(kContainerTypeNumLogicalElements != 0, "kContainerTypeNumLogicalElements can not be zero");
+  static_assert(kContainerTypeNumBytes != 0, "kContainerTypeNumBytes can not be zero");
+};
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// This class provides a mechanism for packing and unpacking elements smaller than one byte. It
+/// assumes these sub-byte elements are packed in a traditional C++ numeric type.
+///
+/// The intended application is to provide a mechanism to indirectly reference elements in
+/// memory or Array<> objects whose addresses cannot otherwise be taken since they are smaller
+/// than one byte.
+///
+/// Supports basic pointer arithmetic:
+///
+/// Example:
+///
+///   int4b_t *ptr = ...;
+///
+///   SubbyteReference<int4b_t> ref = ptr;
+///   ref += 15;
+///
+///   int4b_t x = ref;      // load an int4b_t
+///   ref = x + 2_s4;      // perform arithmetic on int4b_t and then store
+///
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_ = uint8_t,    /// Underlying storage type. Must be able to hold an integer
+                                  ///   number of objects of type Element.
+  class = void
+>
+class ConstSubbyteReference {
+public:
+  using Element = Element_;
+  using Storage = Storage_;
+  using StoragePointer = Storage const *;
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
+    "Size of Element must not be greater than Storage.");
+  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
+    "Storage must be divisible by Element");
+private:
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
+  ///! Bit mask
+  Storage const kMask =
+    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ?
+      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
+      ~Storage(0));
+private:
+  /// Pointer to array containing element
+  StoragePointer ptr_;
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+public:
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element const *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ):
+    ptr_(reinterpret_cast<StoragePointer>(ptr)),
+    offset_(0) {
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+  }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element *ptr = nullptr
+  ): ConstSubbyteReference(ptr, 0) { }
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StoragePointer storage_pointer() const {
+    return ptr_;
+  }
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    Storage item = Storage((*ptr_ >> (offset_ * sizeof_bits<Element>::value)) & kMask);
+    return reinterpret_cast<Element const &>(item);
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(int offset) {
+    offset += offset_;
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(long long offset) {
+    offset += offset_;
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(int offset) {
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(long long offset) {
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    return *this;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(int offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(long long offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-(int offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-=(long long offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(ConstSubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_ =             /// Underlying storage type. Must be able to hold an integer
+                                  ///   number of objects of type Element.
+#if defined(__CUDA_ARCH__)        /// Default size depends on width of atomicCas() overloads.
+  #if (__CUDA_ARCH__ >= 700)      ///
+  uint16_t
+  #else
+  uint32_t
+  #endif
+#else
+  uint8_t
+#endif
+  ,
+  class = void
+>
+class SubbyteReference {
+public:
+  using Element = Element_;
+  using Storage = Storage_;
+  using StoragePointer = Storage *;
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<Storage>::value,
+    "Size of Element must not be greater than Storage.");
+  static_assert(!(sizeof_bits<Storage>::value % sizeof_bits<Element>::value),
+    "Storage must be divisible by Element");
+private:
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<Storage>::value / sizeof_bits<Element>::value;
+  ///! Bit mask
+  Storage const kMask =
+    ((sizeof_bits<Element>::value < sizeof_bits<Storage>::value) ?
+      (Storage(1) << sizeof_bits<Element>::value) - Storage(1) :
+      ~Storage(0));
+private:
+  /// Pointer to array containing element
+  StoragePointer ptr_;
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+public:
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(): ptr_(nullptr), offset_(0) { }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ):
+    ptr_(reinterpret_cast<StoragePointer>(ptr)),
+    offset_(0) {
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+  }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr = nullptr
+  ): SubbyteReference(ptr, 0) { }
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StoragePointer storage_pointer() const {
+    return ptr_;
+  }
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  Element * operator&() const {
+    return reinterpret_cast<Element *>(ptr_);
+  }
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    uint8_t const* byte_ptr = reinterpret_cast<uint8_t const*>(ptr_);
+    // Convert offset in elements to offset in bytes
+    constexpr int elements_per_byte = cutlass::sizeof_bits<uint8_t>::value / cutlass::sizeof_bits<Element>::value;
+    byte_ptr += offset_ / elements_per_byte;
+    // Offset of element within a byte
+    int byte_offset = offset_ % elements_per_byte;
+    uint8_t item = uint8_t((*byte_ptr >> (byte_offset * cutlass::sizeof_bits<Element>::value)) & kMask);
+    return reinterpret_cast<Element const &>(item);
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference & set(Element const &x) {
+    Storage item        = (reinterpret_cast<Storage const &>(x) & kMask);
+    Storage kUpdateMask = Storage(~(kMask << (offset_ * cutlass::sizeof_bits<Element>::value)));
+    Storage new_bits    = Storage(item << (offset_ * cutlass::sizeof_bits<Element>::value));
+#if defined(__CUDA_ARCH__)
+    //
+    // Homebrew read-modify-write
+    //
+    Storage original;
+    Storage updated;
+    do {
+      original = (*ptr_);
+      updated  = Storage((original & kUpdateMask) | new_bits);
+      original = atomicCAS(ptr_, original, updated);
+    } while (updated != original);
+#else
+    Storage original = (*ptr_);
+    Storage updated  = Storage((original & kUpdateMask) | new_bits);
+    *ptr_ = updated;
+#endif
+    return *this;
+  }
+  ////
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(Element const & x) {
+    return set(x);
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(SubbyteReference const & x) {
+    return set(x.get());
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(
+      ConstSubbyteReference<Element, Storage> const &x) {
+    return set(x.get());
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(int offset) {
+    offset += offset_;
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(long long offset) {
+    offset += offset_;
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(int offset) {
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(long long offset) {
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    return *this;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(int offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(long long offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(int offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-=(long long offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(SubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T> using _war = T;
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_               /// Underlying basic storage type.
+>
+class SubbyteReference<Element_, Storage_,
+    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
+public:
+  using Element = Element_;
+  /// Note: It's possible that StorageUnit is not divisible by Element.
+  /// For example, an Element instance might be stored across 2 StorageUnit instances.
+  /// Thus, CUTLASS needs a storage vector to hold an integer number of Element instances.
+  using StorageUnit = Storage_;
+private:
+  using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
+public:
+  static int const kBitsStoredVec = StorageContainerCalculator::kContainerTypeNumBits;
+  static int const kNumStorageUnitPerStoredVec = StorageContainerCalculator::kContainerTypeNumStorageUnit;
+  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
+  using StorageVecPointer = StorageVec *;
+  using CudaAtomicType = typename platform::conditional<
+      sizeof_bits<StorageUnit>::value == 16,
+      uint32_t,
+      uint64_t
+    >::type;
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
+    "Size of Element must not be greater than StorageVec.");
+  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
+    "StorageVec must be divisible by Element");
+private:
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
+  ///! Bit mask for storage unit.
+  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
+  /// Pointer to array containing element
+  _war<StorageVecPointer> ptr_;
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+  /// Element may be stored across 2 storage unit.
+  ///   Low storage unit index in StorageVec
+  ///   High storage unit index in StorageVec
+  int low_storage_unit_idx_;
+  int high_storage_unit_idx_;
+  /// Full Mask to extract the entire element
+  uint64_t full_element_mask_;
+  /// Mask to extract the Element from Low storage unit and High storage unit.
+  StorageUnit low_storage_mask_;
+  StorageUnit high_storage_mask_;
+  /// Start bit index inside the storage unit.
+  int start_bit_idx_;
+private:
+  CUTLASS_HOST_DEVICE
+  void update_element_status() {
+    int num_bits = offset_ * sizeof_bits<Element>::value;
+    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
+    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
+    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value
+                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
+    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
+    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
+    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
+  }
+public:
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(): ptr_(nullptr), offset_(0) { }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ):
+    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
+    offset_(0) {
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+    update_element_status();
+  }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  SubbyteReference(
+    Element *ptr = nullptr
+  ): SubbyteReference(ptr, 0) { }
+  /// Gets StorageVec pointer
+  CUTLASS_HOST_DEVICE
+  StorageVecPointer storage_pointer() const {
+    return ptr_;
+  }
+  /// Gets StorageVec pointer
+  CUTLASS_HOST_DEVICE
+  Element * operator&() const {
+    return reinterpret_cast<Element *>(ptr_);
+  }
+  /// Gets element offset within StorageVec vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
+    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
+    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
+    uint8_t result = uint8_t(full_item >> start_bit_idx_);
+    return reinterpret_cast<Element const &>(result);
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference & set(Element const &x) {
+    uint64_t item = static_cast<uint64_t>((reinterpret_cast<uint8_t const &>(x) & kMask)) << start_bit_idx_;
+    StorageUnit low_new_bits  = StorageUnit(item & ~StorageUnit(0));
+    StorageUnit high_new_bits = StorageUnit(item >> sizeof_bits<StorageUnit>::value);
+    StorageUnit const kLowUpdateMask  = StorageUnit((~full_element_mask_) & (~StorageUnit(0)));
+    StorageUnit const kHighUpdateMask = StorageUnit(((~full_element_mask_) >> sizeof_bits<StorageUnit>::value) & (~StorageUnit(0)));
+#if defined(__CUDA_ARCH__)
+    //
+    // Homebrew read-modify-write
+    //
+    if(high_storage_unit_idx_ != low_storage_unit_idx_){
+      /// Only need update 2 storage unit at once.
+      /// consider misaligned address issue, we need to do atomicCAS twice
+      StorageUnit original_low_bits, original_high_bits, update_low_bits, update_high_bits;
+      do {
+        original_low_bits  = ((*ptr_)[low_storage_unit_idx_]);
+        update_low_bits  = (original_low_bits & kLowUpdateMask) | low_new_bits;
+        original_low_bits = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original_low_bits, update_low_bits);
+      } while (update_low_bits != original_low_bits);
+      do {
+        original_high_bits = ((*ptr_)[high_storage_unit_idx_]);
+        update_high_bits  = (original_high_bits & kHighUpdateMask) | high_new_bits;
+        original_high_bits = atomicCAS(&((*ptr_)[high_storage_unit_idx_]), original_high_bits, update_high_bits);
+      } while (update_high_bits != original_high_bits);
+    }
+    else {
+      /// Only need update 1 storage unit.
+      StorageUnit original, updated;
+      do {
+        original = ((*ptr_)[low_storage_unit_idx_]);
+        updated = (original & kLowUpdateMask) | low_new_bits;
+        original = atomicCAS(&((*ptr_)[low_storage_unit_idx_]), original, updated);
+      } while (updated != original);
+    }
+#else
+    StorageUnit update_low_bits  = ((*ptr_)[low_storage_unit_idx_] & kLowUpdateMask) | low_new_bits;
+    StorageUnit update_high_bits = ((*ptr_)[high_storage_unit_idx_] & kHighUpdateMask) | high_new_bits;
+    (*ptr_)[low_storage_unit_idx_] = update_low_bits;
+    if(low_storage_unit_idx_ != high_storage_unit_idx_)
+      (*ptr_)[high_storage_unit_idx_] = update_high_bits;
+#endif
+    return *this;
+  }
+  ////
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(Element const & x) {
+    return set(x);
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(SubbyteReference const & x) {
+    return set(x.get());
+  }
+  /// Stores an element to memory
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator=(
+      ConstSubbyteReference<Element, StorageVec> const &x) {
+    return set(x.get());
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(int offset) {
+    offset += offset_;
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator+=(long long offset) {
+    offset += offset_;
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(int offset) {
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference &operator-=(long long offset) {
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(int offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator+(long long offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-(int offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  SubbyteReference operator-=(long long offset) const {
+    SubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(SubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+template<typename T> using _war = T;
+template <
+  typename Element_,              /// CUTLASS numeric element type.
+  typename Storage_               /// Underlying storage type. Must be able to hold an integer
+>
+class ConstSubbyteReference<Element_, Storage_,
+    typename platform::enable_if<sizeof_bits<Storage_>::value % sizeof_bits<Element_>::value != 0>::type> {
+public:
+  using Element = Element_;
+  ///! Note: Storage unit could not be divisibale by Element,
+  ///   Type element may be stored across 2 storage units, so need a storage vector to hold integer
+  ///   number of objects of type Element.
+  using StorageUnit = Storage_;
+  static int const kBitsStoredVec = cutlass::lcm_cxx11(sizeof_bits<Element>::value, sizeof_bits<StorageUnit>::value);
+  static int const kNumStorageUnitPerStoredVec = kBitsStoredVec / sizeof_bits<StorageUnit>::value;
+  using StorageVec = StorageUnit[kNumStorageUnitPerStoredVec];
+  using StorageVecPointer = StorageVec const *;
+  using CudaAtomicType = typename platform::conditional<
+      sizeof_bits<StorageUnit>::value == 16,
+      uint32_t,
+      uint64_t
+    >::type;
+  static_assert(sizeof_bits<Element>::value <= sizeof_bits<StorageVec>::value,
+    "Size of Element must not be greater than StorageVec.");
+  static_assert(!(sizeof_bits<StorageVec>::value % sizeof_bits<Element>::value),
+    "StorageVec must be divisible by Element");
+private:
+  ///! Number of elements per storage vector
+  int const kElementsPerVector = sizeof_bits<StorageVec>::value / sizeof_bits<Element>::value;
+  ///! Bit mask for storage unit.
+  StorageUnit const kMask = (StorageUnit(1) << sizeof_bits<Element>::value) - StorageUnit(1);
+  /// Pointer to array containing element
+  _war<StorageVecPointer> ptr_;
+  /// Offset (in units of elements) from pointer.
+  ///
+  /// Invariant: must always be in range [0, kElementsPerVector)
+  int offset_;
+  /// Element may be stored across 2 storage unit.
+  ///   Low storage unit index in StorageVec
+  ///   High storage unit index in StorageVec
+  int low_storage_unit_idx_;
+  int high_storage_unit_idx_;
+  /// Full Mask to extract the entire element
+  uint64_t full_element_mask_;
+  /// Mask to extract the Element from Low storage unit and High storage unit.
+  StorageUnit low_storage_mask_;
+  StorageUnit high_storage_mask_;
+  /// Start bit index inside the storage unit.
+  int start_bit_idx_;
+private:
+  CUTLASS_HOST_DEVICE
+  void update_element_status() {
+    int num_bits = offset_ * sizeof_bits<Element>::value;
+    start_bit_idx_ = num_bits % sizeof_bits<StorageUnit>::value;
+    low_storage_unit_idx_ = num_bits / sizeof_bits<StorageUnit>::value;
+    high_storage_unit_idx_ = sizeof_bits<StorageUnit>::value - (start_bit_idx_) < sizeof_bits<Element>::value
+                              ? low_storage_unit_idx_ + 1 : low_storage_unit_idx_;
+    full_element_mask_ = uint64_t(kMask) << start_bit_idx_;
+    low_storage_mask_ = StorageUnit(full_element_mask_ & ~StorageUnit(0));
+    high_storage_mask_ = StorageUnit((full_element_mask_ >> sizeof_bits<StorageUnit>::value) & ~StorageUnit(0));
+  }
+public:
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(): ptr_(nullptr), offset_(0) { }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element const *ptr,           /// pointer to memory
+    int64_t offset          /// logical offset in units of Element
+  ):
+    ptr_(reinterpret_cast<StorageVecPointer>(ptr)),
+    offset_(0) {
+    int64_t offset_in_vectors = offset / kElementsPerVector;
+    int64_t offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = int(offset_in_elements);
+    update_element_status();
+  }
+  /// Constructor
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference(
+    Element *ptr = nullptr
+  ): ConstSubbyteReference(ptr, 0) { }
+  /// Gets storage pointer
+  CUTLASS_HOST_DEVICE
+  StorageVecPointer storage_pointer() const {
+    return ptr_;
+  }
+  /// Gets element offset within storage vector
+  CUTLASS_HOST_DEVICE
+  int element_offset() const {
+    return offset_;
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  Element get() const {
+    StorageUnit low_bits = (*ptr_)[low_storage_unit_idx_] & low_storage_mask_;
+    StorageUnit high_bits = low_storage_unit_idx_ != high_storage_unit_idx_ ? (*ptr_)[high_storage_unit_idx_] & high_storage_mask_ : 0;
+    uint64_t full_item = ((uint64_t)high_bits << sizeof_bits<StorageUnit>::value) | low_bits;
+    uint8_t result = uint8_t(full_item >> start_bit_idx_);
+    return reinterpret_cast<Element const &>(result);
+  }
+  /// Unpacks an element from memory
+  CUTLASS_HOST_DEVICE
+  operator Element() const {
+    return get();
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(int offset) {
+    offset += offset_;
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator+=(long long offset) {
+    offset += offset_;
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ += offset_in_vectors;
+    offset_ = offset_in_elements;
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(int offset) {
+    int offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = offset % kElementsPerVector;
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+  /// Adds an offset in units of elements to the reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference &operator-=(long long offset) {
+    long long offset_in_vectors = offset / kElementsPerVector;
+    int offset_in_elements = int(offset % kElementsPerVector);
+    ptr_ -= offset_in_vectors;
+    offset_ -= offset_in_elements;
+    if (offset_ < 0) {
+      offset_ += kElementsPerVector;
+      --ptr_;
+    }
+    update_element_status();
+    return *this;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(int offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator+(long long offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref += offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-(int offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Returns a reference to an element with a given offset from the current reference
+  CUTLASS_HOST_DEVICE
+  ConstSubbyteReference operator-=(long long offset) const {
+    ConstSubbyteReference ref(ptr_, offset_);
+    ref -= offset;
+    return ref;
+  }
+  /// Computes the difference in elements between references
+  CUTLASS_HOST_DEVICE
+  ptrdiff_t operator-(ConstSubbyteReference ref) const {
+    return (ptr_ - ref.ptr_) * kElementsPerVector + (offset_ - ref.offset_);
+  }
+  /// Explicit cast to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(get());
+  }
+  /// Explicit cast to signed 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator int64_t() const {
+    return int64_t(get());
+  }
+  /// Explicit cast to unsigned 64-bit integer
+  CUTLASS_HOST_DEVICE
+  explicit operator uint64_t() const {
+    return uint64_t(get());
+  }
+  /// Explicit cast to float
+  CUTLASS_HOST_DEVICE
+  explicit operator float() const {
+    return float(get());
+  }
+  /// Explicit cast to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(get());
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Element, bool subbyte = (sizeof_bits<Element>::value < 8)>
+struct ReferenceFactory;
+template <typename Element>
+struct ReferenceFactory<Element, false> {
+  ///! Number of elements per storage vector
+  static int const kElementsPerVector = 1;
+  CUTLASS_HOST_DEVICE
+  static Element &get(Element *ptr, int64_t offset) {
+    return ptr[offset];
+  }
+  CUTLASS_HOST_DEVICE
+  static Element const &get(Element const *ptr, int64_t offset) {
+    return ptr[offset];
+  }
+  CUTLASS_HOST_DEVICE
+  static Element *add_pointer_offset(Element *ptr, int64_t offset) {
+    return ptr + offset;
+  }
+  CUTLASS_HOST_DEVICE
+  static Element const *add_pointer_offset(Element const *ptr, int64_t offset) {
+    return ptr + offset;
+  }
+};
+template <typename Element>
+struct ReferenceFactory<Element, true> {
+  //
+  // Static methods
+  //
+  CUTLASS_HOST_DEVICE
+  static SubbyteReference<Element> get(Element *ptr, int64_t offset) {
+    return SubbyteReference<Element>(ptr, offset);
+  }
+  CUTLASS_HOST_DEVICE
+  static ConstSubbyteReference<Element> get(Element const *ptr,
+                                             int64_t offset) {
+    return ConstSubbyteReference<Element>(ptr, offset);
+  }
+  /// Helper to add an offset in number of elements, assuming this offset is divisible
+  /// by the vector size.
+  CUTLASS_HOST_DEVICE
+  static Element *add_pointer_offset(Element *ptr, int64_t offset_in_elements) {
+    return &SubbyteReference<Element>(ptr, offset_in_elements);
+  }
+  /// Helper to add an offset in number of elements, assuming this offset is divisible
+  /// by the vector size.
+  CUTLASS_HOST_DEVICE
+  static Element const *add_pointer_offset(Element const *ptr, int64_t offset_in_elements) {
+    return &ConstSubbyteReference<Element>(ptr, offset_in_elements);
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_coord.h ADDED Viewed

	@@ -0,0 +1,326 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a canonical coordinate for rank=4 tensors offering named indices.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+namespace cutlass {
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a canonical 4D coordinate used by tensor operations.
+struct Tensor4DCoord : public Coord<4> {
+  /// Base class
+  using Base = Coord<4>;
+  /// Index type
+  using Index = typename Base::Index;
+  /// LongIndex type
+  using LongIndex = typename Base::LongIndex;
+  /// Batch dimension
+  static int const kN = 0;
+  /// Height dimension
+  static int const kH = 1;
+  /// Width dimension
+  static int const kW = 2;
+  /// Channels dimension
+  static int const kC = 3;
+  //
+  // Methods
+  //
+  /// Default ctor
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord() { }
+  /// Constructs from Coord<4>
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(Coord<4> const &coord): Base(coord) { }
+  /// Helper to construct from N, H, W, and C.
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(Index n, Index h, Index w, Index c): Base(make_Coord(n, h, w, c)) { }
+  /// Helper to construct from N, H, W, and C, which are LongIndex type
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord(LongIndex n, LongIndex h, LongIndex w, LongIndex c)
+    : Base(make_Coord(Index(n), Index(h), Index(w), Index(c))) { }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & n() const { return this->at(kN); }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & n() { return this->at(kN); }
+  /// Returns the row of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & h() const { return this->at(kH); }
+  /// Returns the row of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & h() { return this->at(kH); }
+  /// Returns the column of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & w() const { return this->at(kW); }
+  /// Returns the column of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & w() { return this->at(kW); }
+  /// Returns the channel of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & c() const { return this->at(kC); }
+  /// Returns the channel of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & c() { return this->at(kC); }
+  //
+  // Coord operators
+  //
+  /// Element-wise addition
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator+(Base const& b) const {
+    return Tensor4DCoord(Base::operator+(b));
+  }
+  /// Element-wise subtraction
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator-(Base const& b) const {
+    return Tensor4DCoord(Base::operator-(b));
+  }
+  /// Element-wise multiplication
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator*(Base const& b) const {
+    return Tensor4DCoord(Base::operator*(b));
+  }
+  /// Element-wise division
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord operator/(Base const& b) const {
+    return Tensor4DCoord(Base::operator/(b));
+  }
+  /// In-place addition
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator+=(Base const& b) {
+    Base::operator+=(b);
+    return *this;
+  }
+  /// In-place subtraction
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator-=(Base const& b) {
+    Base::operator-=(b);
+    return *this;
+  }
+  /// In-place multiplication
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator*=(Base const& b) {
+    Base::operator*=(b);
+    return *this;
+  }
+  /// In-place division
+  CUTLASS_HOST_DEVICE
+  Tensor4DCoord& operator/=(Base const& b) {
+    Base::operator/=(b);
+    return *this;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines a canonical 5D coordinate used by tensor operations.
+struct Tensor5DCoord : public Coord<5> {
+  /// Base class
+  using Base = Coord<5>;
+  /// Index type
+  using Index = typename Base::Index;
+  /// LongIndex type
+  using LongIndex = typename Base::LongIndex;
+  /// Batch dimension
+  static int const kN = 0;
+  /// Depth dimension
+  static int const kD = 1;
+  /// Height dimension
+  static int const kH = 2;
+  /// Width dimension
+  static int const kW = 3;
+  /// Channels dimension
+  static int const kC = 4;
+  //
+  // Methods
+  //
+  /// Default ctor
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord() { }
+  /// Constructs from Coord<5>
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(Coord<5> const &coord): Base(coord) { }
+  /// Helper to construct from N, D, H, W, and C.
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(Index n, Index d, Index h, Index w, Index c): Base(make_Coord(n, d, h, w, c)) { }
+  /// Helper to construct from N, D, H, W, and C, which are LongIndex type
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord(LongIndex n, LongIndex d, LongIndex h, LongIndex w, LongIndex c)
+    : Base(make_Coord(Index(n), Index(d), Index(h), Index(w), Index(c))) { }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & n() const { return this->at(kN); }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & n() { return this->at(kN); }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & d() const { return this->at(kD); }
+  /// Returns the batch of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & d() { return this->at(kD); }
+  /// Returns the row of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & h() const { return this->at(kH); }
+  /// Returns the row of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & h() { return this->at(kH); }
+  /// Returns the column of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & w() const { return this->at(kW); }
+  /// Returns the column of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & w() { return this->at(kW); }
+  /// Returns the channel of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index const & c() const { return this->at(kC); }
+  /// Returns the channel of the coordinate
+  CUTLASS_HOST_DEVICE
+  Index & c() { return this->at(kC); }
+  //
+  // Coord operators
+  //
+  /// Element-wise addition
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator+(Base const& b) const {
+    return Tensor5DCoord(Base::operator+(b));
+  }
+  /// Element-wise subtraction
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator-(Base const& b) const {
+    return Tensor5DCoord(Base::operator-(b));
+  }
+  /// Element-wise multiplication
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator*(Base const& b) const {
+    return Tensor5DCoord(Base::operator*(b));
+  }
+  /// Element-wise division
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord operator/(Base const& b) const {
+    return Tensor5DCoord(Base::operator/(b));
+  }
+  /// In-place addition
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator+=(Base const& b) {
+    Base::operator+=(b);
+    return *this;
+  }
+  /// In-place subtraction
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator-=(Base const& b) {
+    Base::operator-=(b);
+    return *this;
+  }
+  /// In-place multiplication
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator*=(Base const& b) {
+    Base::operator*=(b);
+    return *this;
+  }
+  /// In-place division
+  CUTLASS_HOST_DEVICE
+  Tensor5DCoord& operator/=(Base const& b) {
+    Base::operator/=(b);
+    return *this;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_ref.h ADDED Viewed

	@@ -0,0 +1,419 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/coord.h"
+#include "cutlass/platform/platform.h"
+#include "cutlass/subbyte_reference.h"
+namespace cutlass {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// Default layout function from coordinates in a tensor's index space into the n-D array held
+/// in memory.
+///
+/// All layout functions must define at least the members shown in IdentityTensorLayout<>.
+template <int Rank>
+class IdentityTensorLayout {
+public:
+  /// Logical rank of tensor
+  static int const kRank = Rank;
+  /// Rank of stride vector
+  static int const kStrideRank = Rank;
+  /// Index type used for coordinates
+  using Index = int32_t;
+  /// Long index type used for offsets
+  using LongIndex = int64_t;
+  /// Logical coordinate
+  using TensorCoord = Coord<kRank, Index>;
+  /// Stride vector
+  using Stride = Coord<kStrideRank, Index>;
+private:
+  //
+  // Data members
+  //
+  /// Stride data member
+  Stride stride_;
+public:
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  IdentityTensorLayout(Stride const &stride = Stride()): stride_(stride) { }
+  /// Returns the offset of a coordinate in linear memory
+  CUTLASS_HOST_DEVICE
+  LongIndex operator()(Coord<Rank> const &coord) const {
+    return coord.dot(stride_);
+  }
+  /// Returns the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return stride_;
+  }
+  /// Returns the stride of the layout
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return stride_;
+  }
+  /// Compute the number of contiguous elements needed to store a tensor with the given size
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity(TensorCoord const &size) const {
+    int idx = stride_.max_dim_index();
+    return stride_[idx] * size[idx];
+  }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* \brief TensorRef is a template for objects pointing to the start of tensors of arbitrary rank
+          and layout within memory. A TensorRef combines a pointer and a Layout concept
+  Examples:
+  (These examples use helpers for matrix layouts defined in cutlass/layout/matrix.h)
+  1. Column-major matrix may be represented as a rank=2 tensor:
+    TensorRef<float, layout::ColumnMajor> A(ptr_A, ldm);
+  2. Row-major matrix may be represented as a rank=2 tensor:
+    TensorRef<float, layout::RowMajor> B(ptr_A, ldm);
+  3. An interleaved matrix may be represented as a rank=2 tensor:
+    TensorRef<int8_t, layout::ColumnMajorInterleaved<32> > C;
+  4. A helper exists to define a TensorRef for a contiguous matrix whose layout
+     is not known at compile time.
+    int ldm;                     // leading dimension
+    layout::Matrix kind;         // Could be layout::Matrix::kRowMajor or layout::Matrix::kColumnMajor
+    TensorRef<int, layout::ContiguousMatrix> E(ptr_E, {ldm, kind});
+*/
+template <
+  /// Data type of element stored within tensor (concept: NumericType)
+  typename Element_,
+  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
+  typename Layout_
+>
+class TensorRef {
+ public:
+  /// Data type of individual access
+  using Element = Element_;
+  /// Mapping function from logical coordinate to linear memory
+  using Layout = Layout_;
+  /// Reference type to an element
+  using Reference = typename platform::conditional<
+    sizeof_bits<Element>::value >= 8,
+    Element &,
+    SubbyteReference<Element>
+    >::type;
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+  /// Index type
+  using Index = typename Layout::Index;
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+  /// TensorRef to constant data
+  using ConstTensorRef = TensorRef<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+  /// TensorRef to non-constant data
+  using NonConstTensorRef = TensorRef<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+ private:
+  /// Pointer
+  Element* ptr_;
+  /// Layout object maps logical coordinates to linear offsets
+  Layout layout_;
+ public:
+  //
+  // Methods
+  //
+  /// Constructs a TensorRef with a pointer and layout object.
+  CUTLASS_HOST_DEVICE
+  TensorRef(): ptr_(nullptr) {
+  }
+  /// Constructs a TensorRef with a pointer and layout object.
+  CUTLASS_HOST_DEVICE
+  TensorRef(
+    Element *ptr,                   ///< pointer to start of tensor
+    Layout const &layout            ///< layout object containing stride and mapping function
+  ):
+    ptr_(ptr), layout_(layout) {
+  }
+  /// Converting constructor from TensorRef to non-constant data.
+  template<typename _Magic = int>
+  CUTLASS_HOST_DEVICE
+  TensorRef(
+    NonConstTensorRef const &ref,              ///< TensorRef to non-const data
+    ///SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
+    _Magic magic = (typename platform::enable_if< ! platform::is_same<NonConstTensorRef, TensorRef<Element_, Layout_> >::value, _Magic>::type)0
+  ):
+    ptr_(ref.data()), layout_(ref.layout()) { }
+  /// Returns a reference to constant-valued tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(ptr_, layout_);
+  }
+  CUTLASS_HOST_DEVICE
+  NonConstTensorRef non_const_ref() const {
+    return NonConstTensorRef(const_cast<typename platform::remove_const<Element>::type *>(ptr_), layout_);
+  }
+  /// Updates only the pointer
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr = nullptr) {
+    ptr_ = ptr;
+  }
+  /// Updates the pointer and layout object
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout) {
+    ptr_ = ptr;
+    layout_ = layout;
+  }
+  /// Returns true if the TensorRef is non-null
+  CUTLASS_HOST_DEVICE
+  bool good() const {
+    return ptr_ != nullptr;
+  }
+  /// Returns the pointer to referenced data
+  CUTLASS_HOST_DEVICE
+  Element * data() const { return ptr_; }
+  /// Returns a reference to the element at a given linear index
+  CUTLASS_HOST_DEVICE
+  Reference data(LongIndex idx) const {
+    return ReferenceFactory<typename platform::remove_const<Element>::type,
+                            (sizeof_bits<Element>::value < 8)>::get(ptr_, idx);
+  }
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  Layout & layout() {
+    return layout_;
+  }
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  Layout layout() const {
+    return layout_;
+  }
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  typename Layout::Stride::Index stride(int dim) const {
+    return layout_.stride().at(dim);
+  }
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  typename Layout::Stride::Index & stride(int dim) {
+    return layout_.stride().at(dim);
+  }
+  /// Computes the offset of an index from the origin of the tensor
+  CUTLASS_HOST_DEVICE
+  LongIndex offset(TensorCoord const& coord) const {
+    return layout_(coord);
+  }
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference at(TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference operator[](TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+  /// Adds an offset to each pointer
+  CUTLASS_HOST_DEVICE
+  TensorRef & add_pointer_offset(LongIndex offset_) {
+    ptr_ = ReferenceFactory<typename platform::remove_const<Element>::type,
+           (sizeof_bits<Element>::value < 8)>::add_pointer_offset(ptr_, offset_);
+    return *this;
+  }
+  /// Adds an offset to each pointer
+  CUTLASS_HOST_DEVICE
+  TensorRef & add_coord_offset(TensorCoord const &coord) {
+    add_pointer_offset(offset(coord));
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRef operator+(TensorCoord const& b) const {
+    TensorRef result(*this);
+    result.add_coord_offset(b);
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRef & operator+=(TensorCoord const& b) {
+    add_coord_offset(b);
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRef operator-(TensorCoord const& b) const {
+    TensorRef result(*this);
+    result.add_pointer_offset(-offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRef & operator-=(TensorCoord const& b) {
+    add_pointer_offset(-offset(b));
+    return *this;
+  }
+};
+/// Constructs a TensorRef, deducing types from arguments.
+template <
+  typename Element,
+  typename Layout
+>
+CUTLASS_HOST_DEVICE
+TensorRef<Element, Layout> make_TensorRef(Element *ptr, Layout const &layout) {
+  return TensorRef<Element, Layout>(ptr, layout);
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Partial specializations to handle degenerate and sub-byte cases.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Element,
+  typename Layout
+>
+CUTLASS_HOST_DEVICE
+bool TensorRef_aligned(TensorRef<Element, Layout> const &ref, int alignment) {
+  int const kStrideRank = Layout::kStrideRank;
+  if (reinterpret_cast<uintptr_t>(ref.data()) % alignment) {
+    return false;
+  }
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < kStrideRank; ++i) {
+    if (ref.stride(i) % alignment) {
+      return false;
+    }
+  }
+  return true;
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_ref_planar_complex.h ADDED Viewed

	@@ -0,0 +1,374 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides, bounds, and a pointer to tensor data.
+*/
+#pragma once
+#include <cstdint>
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+#include "cutlass/tensor_ref.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Element_>
+struct PlanarComplexReference {
+  //
+  // Type definitions
+  //
+  using Element = Element_;
+  using ComplexElement = complex<Element>;
+  //
+  // Data members
+  //
+  Element *real;
+  Element *imag;
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  PlanarComplexReference(
+    Element *real_ = nullptr,
+    Element *imag_ = nullptr
+  ):
+    real(real_), imag(imag_) { }
+  /// Loads the complex element
+  CUTLASS_HOST_DEVICE
+  operator complex<Element>() const {
+    return complex<Element>{*real, *imag};
+  }
+  /// Stores a complex element to the location pointed to by the reference
+  CUTLASS_HOST_DEVICE
+  PlanarComplexReference &operator=(complex<Element> const &rhs) {
+    *real = rhs.real();
+    *imag = rhs.imag();
+    return *this;
+  }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* \brief TensorRef is a template for objects pointing to the start of tensors of arbitrary rank
+          and layout within memory. A TensorRef combines a pointer and a Layout concept
+*/
+template <
+  /// Data type of element stored within tensor (concept: NumericType)
+  typename Element_,
+  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
+  typename Layout_
+>
+class TensorRefPlanarComplex {
+ public:
+  /// Data type of individual access
+  using Element = Element_;
+  /// Complex element type
+  using ComplexElement = complex<Element>;
+  /// Mapping function from logical coordinate to linear memory
+  using Layout = Layout_;
+  static_assert(sizeof_bits<Element>::value >= 8,
+    "Planar complex not suitable for subbyte elements at this time");
+  /// Reference type to an element
+  using Reference = PlanarComplexReference<Element>;
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+  /// Index type
+  using Index = typename Layout::Index;
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Layout's stride vector
+  using Stride = typename Layout::Stride;
+  /// TensorRef to constant data
+  using ConstTensorRef = TensorRefPlanarComplex<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+  /// TensorRef to non-constant data
+  using NonConstTensorRef = TensorRefPlanarComplex<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+ private:
+  /// Pointer
+  Element* ptr_;
+  /// Layout object maps logical coordinates to linear offsets
+  Layout layout_;
+  /// Offset to imaginary part
+  LongIndex imaginary_stride_;
+ public:
+  //
+  // Methods
+  //
+  /// Constructs a TensorRef with a pointer and layout object.
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex(
+    Element *ptr = nullptr,                   ///< pointer to start of tensor
+    Layout const &layout = Layout(),          ///< layout object containing stride and mapping function
+    LongIndex imaginary_stride = 0
+  ):
+    ptr_(ptr), layout_(layout), imaginary_stride_(imaginary_stride) {
+  }
+  /// Converting constructor from TensorRef to non-constant data.
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex(
+    NonConstTensorRef const &ref              ///< TensorRef to non-const data
+  ):
+    ptr_(ref.data()), layout_(ref.layout()), imaginary_stride_(ref.imaginary_stride_) { }
+  /// Returns a reference to constant-valued tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(ptr_, layout_, imaginary_stride_);
+  }
+  CUTLASS_HOST_DEVICE
+  NonConstTensorRef non_const_ref() const {
+    return NonConstTensorRef(
+      const_cast<typename platform::remove_const<Element>::type *>(ptr_),
+      layout_,
+      imaginary_stride_);
+  }
+  /// Updates only the pointer
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr = nullptr, LongIndex imaginary_stride = 0) {
+    ptr_ = ptr;
+    imaginary_stride_ = imaginary_stride;
+  }
+  /// Updates the pointer and layout object
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride) {
+    ptr_ = ptr;
+    layout_ = layout;
+    imaginary_stride_ = imaginary_stride;
+  }
+  /// Returns true if the TensorRef is non-null
+  CUTLASS_HOST_DEVICE
+  bool good() const {
+    return ptr_ != nullptr;
+  }
+  /// Returns the pointer to referenced data
+  CUTLASS_HOST_DEVICE
+  Element * data() const { return ptr_; }
+  /// Returns the pointer to referenced data
+  CUTLASS_HOST_DEVICE
+  Element * imaginary_data() const { return ptr_ + imaginary_stride_; }
+  /// Returns a reference to the element at a given linear index
+  CUTLASS_HOST_DEVICE
+  Reference data(LongIndex idx) const {
+    return Reference(ptr_ + idx, ptr_ + idx + imaginary_stride_);
+  }
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  Layout & layout() {
+    return layout_;
+  }
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  Layout layout() const {
+    return layout_;
+  }
+  /// Gets the stride to an imaginary element
+  LongIndex imaginary_stride() const {
+    return imaginary_stride_;
+  }
+  /// Gets the stride to an imaginary element
+  LongIndex &imaginary_stride() {
+    return imaginary_stride_;
+  }
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride stride() const {
+    return layout_.stride();
+  }
+  /// Returns the layout object's stride vector
+  CUTLASS_HOST_DEVICE
+  Stride & stride() {
+    return layout_.stride();
+  }
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  Index stride(int dim) const {
+    return layout_.stride().at(dim);
+  }
+  /// Returns the layout object's stride in a given physical dimension
+  CUTLASS_HOST_DEVICE
+  Index & stride(int dim) {
+    return layout_.stride().at(dim);
+  }
+  /// Computes the offset of an index from the origin of the tensor
+  CUTLASS_HOST_DEVICE
+  LongIndex offset(TensorCoord const& coord) const {
+    return layout_(coord);
+  }
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference at(TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference operator[](TensorCoord const& coord) const {
+    return data(offset(coord));
+  }
+  /// Adds an offset to each pointer
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & add_pointer_offset(LongIndex offset_) {
+    ptr_ += offset_;
+    return *this;
+  }
+  /// Adds an offset to each pointer
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & add_coord_offset(TensorCoord const &coord) {
+    add_pointer_offset(offset(coord));
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex operator+(TensorCoord const& b) const {
+    TensorRefPlanarComplex result(*this);
+    result.add_coord_offset(b);
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & operator+=(TensorCoord const& b) {
+    add_coord_offset(b);
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex operator-(TensorCoord const& b) const {
+    TensorRefPlanarComplex result(*this);
+    result.add_pointer_offset(-offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorRefPlanarComplex & operator-=(TensorCoord const& b) {
+    add_pointer_offset(-offset(b));
+    return *this;
+  }
+  /// TensorRef to real-valued tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorRef<Element, Layout> ref_real() const {
+    return cutlass::TensorRef<Element, Layout>(data(), layout());
+  }
+  /// TensorRef to real-valued tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorRef<Element, Layout> ref_imag() const {
+    return cutlass::TensorRef<Element, Layout>(imaginary_data(), layout());
+  }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// Constructs a TensorRef, deducing types from arguments.
+template <
+  typename Element,
+  typename Layout
+>
+CUTLASS_HOST_DEVICE
+TensorRefPlanarComplex<Element, Layout> make_TensorRefPlanarComplex(
+  Element *ptr,
+  Layout const &layout,
+  int64_t imaginary_stride) {
+  return TensorRefPlanarComplex<Element, Layout>(ptr, layout, imaginary_stride);
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_view.h ADDED Viewed

	@@ -0,0 +1,297 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides and a pointer to tensor data.
+    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
+    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
+    data storage and is therefore lightweight and may be embedded in larger tensor objects or
+    memory structures.
+    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
+    linear memory.
+*/
+#pragma once
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#endif
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref.h"
+namespace cutlass {
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Data type of element stored within tensor
+  typename Element_,
+  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
+  typename Layout_
+>
+class TensorView : public TensorRef<Element_, Layout_> {
+ public:
+  /// Base tensor reference
+  using Base = cutlass::TensorRef<Element_, Layout_>;
+  /// Mapping function from logical coordinate to internal n-D array
+  using Layout = Layout_;
+  /// TensorRef pointing to constant memory
+  using ConstTensorRef = typename Base::ConstTensorRef;
+  /// Underlying TensorRef type
+  using TensorRef = Base;
+  /// Data type of individual access
+  using Element = Element_;
+  /// Reference type to an element
+  using Reference = Element &;
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+  /// Index type
+  using Index = typename Layout::Index;
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Coordinate in storage n-D array
+  using Stride = typename Layout::Stride;
+  /// TensorView pointing to constant memory
+  using ConstTensorView = TensorView<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+  /// TensorView pointing to non-constant memory
+  using NonConstTensorView = TensorView<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+ private:
+  /// View extent
+  TensorCoord extent_;
+ public:
+  //
+  // Methods
+  //
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorView() { }
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorView(
+    Element *ptr,                         ///< pointer to start of tensor
+    Layout const &layout,                 ///< layout object containing stride and mapping function
+    TensorCoord const &extent             ///< size of the view in logical coordinates
+  ):
+    Base(ptr, layout), extent_(extent) {
+  }
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorView(
+    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
+    TensorCoord const &extent             ///< logical size of tensor
+  ):
+    Base(ref), extent_(extent) {
+  }
+  /// Converting constructor from TensorRef to non-constant data.
+  CUTLASS_HOST_DEVICE
+  TensorView(
+    NonConstTensorView const &view        ///< TensorView to non-const data
+  ):
+    Base(view), extent_(view.extent_) { }
+  /// Updates the pointer and layout object
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) {
+    Base::reset(ptr, layout);
+    this->resize(extent);
+  }
+  /// Updates the pointer
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr) {
+    Base::reset(ptr);
+  }
+  /// Changes the size of the view without affecting pointer or layout
+  CUTLASS_HOST_DEVICE
+  void resize(TensorCoord const &extent) {
+    this->extent_ = extent;
+  }
+  /// Returns the extent of the view (the size along each logical dimension).
+  CUTLASS_HOST_DEVICE
+  TensorCoord const& extent() const { return extent_; }
+  /// Returns the extent along a particular logical dimension.
+  CUTLASS_HOST_DEVICE
+  Index extent(int dim) const { return extent_.at(dim); }
+  /// Returns the number of logical elements
+  CUTLASS_HOST_DEVICE
+  LongIndex size() const {
+    return extent_.product();
+  }
+  /// Determines whether a location is within a tensor
+  CUTLASS_HOST_DEVICE
+  bool contains(TensorCoord const& coord) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int dim = 0; dim < kRank; ++dim) {
+      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  TensorRef ref() const {
+    return TensorRef(this->data(), this->layout());
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), this->layout());
+  }
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent_);
+  }
+  /// Returns a Tensor_view given location and size quantities
+  CUTLASS_HOST_DEVICE
+  TensorView subview(
+    TensorCoord extent,                               ///< extent of the resulting view
+    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
+  ) const {
+    TensorView result(this->ref(), extent.clamp(extent_ - location));
+    result.add_coord_offset(location);
+    return result;
+  }
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  size_t capacity() const {
+    return Base::layout().capacity(extent_);
+  }
+  /// Returns a TensorView offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorView operator+(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    TensorView result(*this);
+    result.add_pointer_offset(this->offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorView& operator+=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+    this->add_pointer_offset(this->offset(b));
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorView operator-(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    TensorRef result(*this);
+    result.add_pointer_offset(-this->offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorView& operator-=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+    this->add_pointer_offset(-this->offset(b));
+    return *this;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Constructs a TensorRef, deducing types from arguments.
+template <
+  typename Element,
+  typename Layout
+>
+CUTLASS_HOST_DEVICE TensorView<Element, Layout> make_TensorView(
+  Element *ptr,
+  Layout const &layout,
+  typename Layout::TensorCoord const &extent) {
+  return TensorView<Element, Layout>(ptr, layout, extent);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+}  // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tensor_view_planar_complex.h ADDED Viewed

	@@ -0,0 +1,302 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a structure containing strides and a pointer to tensor data.
+    TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus,
+    it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from
+    data storage and is therefore lightweight and may be embedded in larger tensor objects or
+    memory structures.
+    See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to
+    linear memory.
+*/
+#pragma once
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#endif
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_ref_planar_complex.h"
+#include "cutlass/tensor_view.h" // cutlass::TensorView
+namespace cutlass {
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Data type of element stored within tensor
+  typename Element_,
+  /// Maps a Coord<Rank_> in the logical tensor index space to the internal n-D array
+  typename Layout_
+>
+class TensorViewPlanarComplex : public TensorRefPlanarComplex<Element_, Layout_> {
+ public:
+  /// Base tensor reference
+  using Base = cutlass::TensorRefPlanarComplex<Element_, Layout_>;
+  /// Mapping function from logical coordinate to internal n-D array
+  using Layout = Layout_;
+  /// TensorRef pointing to constant memory
+  using ConstTensorRef = typename Base::ConstTensorRef;
+  /// Underlying TensorRef type
+  using TensorRef = Base;
+  /// Data type of individual access
+  using Element = Element_;
+  /// Reference type to an element
+  using Reference = Element &;
+  /// Logical rank of tensor index space
+  static int const kRank = Layout::kRank;
+  /// Index type
+  using Index = typename Layout::Index;
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Coordinate in storage n-D array
+  using Stride = typename Layout::Stride;
+  /// TensorView pointing to constant memory
+  using ConstTensorView = TensorViewPlanarComplex<
+    typename platform::remove_const<Element>::type const,
+    Layout>;
+  /// TensorView pointing to non-constant memory
+  using NonConstTensorView = TensorViewPlanarComplex<
+    typename platform::remove_const<Element>::type,
+    Layout>;
+  /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a
+  /// scalar, but degenerate cases such as these are difficult to accommodate without
+  /// extensive C++ metaprogramming or support for zero-length arrays.
+  static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
+ private:
+  /// View extent
+  TensorCoord extent_;
+ public:
+  //
+  // Methods
+  //
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(TensorCoord const &extent = TensorCoord()): extent_(extent) {
+  }
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(
+    Element *ptr,                         ///< pointer to start of tensor
+    Layout const &layout,                 ///< layout object containing stride and mapping function
+    LongIndex imaginary_stride,           ///< stride between real and imaginary part
+    TensorCoord const &extent             ///< size of the view in logical coordinates
+  ):
+    Base(ptr, layout, imaginary_stride), extent_(extent) {
+  }
+  /// Constructs a TensorView object
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(
+    TensorRef const &ref,                 ///< pointer and layout object referencing a tensor
+    TensorCoord const &extent             ///< logical size of tensor
+  ):
+    Base(ref), extent_(extent) {
+  }
+  /// Converting constructor from TensorRef to non-constant data.
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex(
+    NonConstTensorView const &view        ///< TensorView to non-const data
+  ):
+    Base(view), extent_(view.extent_) { }
+  /// Updates the pointer and layout object
+  CUTLASS_HOST_DEVICE
+  void reset(Element* ptr, Layout const &layout, LongIndex imaginary_stride, TensorCoord size) {
+    Base::reset(ptr, layout, imaginary_stride);
+    this->resize(extent_);
+  }
+  /// Changes the size of the view without affecting pointer or layout
+  CUTLASS_HOST_DEVICE
+  void resize(TensorCoord extent) {
+    this->extent_ = extent;
+  }
+  /// Returns the extent of the view (the size along each logical dimension).
+  CUTLASS_HOST_DEVICE
+  TensorCoord const& extent() const { return extent_; }
+  /// Returns the extent along a particular logical dimension.
+  CUTLASS_HOST_DEVICE
+  Index extent(int dim) const { return extent_.at(dim); }
+  /// Determines whether a location is within a tensor
+  CUTLASS_HOST_DEVICE
+  bool contains(TensorCoord const& coord) const {
+    CUTLASS_PRAGMA_UNROLL
+    for (int dim = 0; dim < kRank; ++dim) {
+      if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
+        return false;
+      }
+    }
+    return true;
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  Base ref() const {
+    return Base(this->data(), this->layout(), this->imaginary_stride());
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), this->layout());
+  }
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent_);
+  }
+  /// Returns a Tensor_view given location and size quantities
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex subview(
+    TensorCoord extent,                               ///< extent of the resulting view
+    TensorCoord const& location = TensorCoord()       ///< resulting view's origin within the old view
+  ) const {
+    TensorViewPlanarComplex result(this->ref(), extent.clamp(extent_ - location));
+    result.add_coord_offset(location);
+    return result;
+  }
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  size_t capacity() const {
+    return Base::layout().capacity(extent_);
+  }
+  /// Returns a TensorView offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex operator+(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    TensorViewPlanarComplex result(*this);
+    result.add_pointer_offset(this->offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex& operator+=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+    this->add_pointer_offset(this->offset(b));
+    return *this;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex operator-(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) const {
+    TensorRef result(*this);
+    result.add_pointer_offset(-this->offset(b));
+    return result;
+  }
+  /// Returns a TensorRef offset by a given amount
+  CUTLASS_HOST_DEVICE
+  TensorViewPlanarComplex& operator-=(
+    TensorCoord const& b            ///< offset in the logical coordinate space of the tensor
+  ) {
+    this->add_pointer_offset(-this->offset(b));
+    return *this;
+  }
+  /// TensorRef to real-valued tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorView<Element, Layout> view_real() const {
+    return cutlass::TensorView<Element, Layout>(this->data(), this->layout(), extent_);
+  }
+  /// TensorRef to real-valued tensor
+  CUTLASS_HOST_DEVICE
+  cutlass::TensorView<Element, Layout> view_imag() const {
+    return cutlass::TensorView<Element, Layout>(this->imaginary_data(), this->layout(), extent_);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/// Constructs a TensorRef, deducing types from arguments.
+template <
+  typename Element,
+  typename Layout
+>
+CUTLASS_HOST_DEVICE TensorViewPlanarComplex<Element, Layout> make_TensorViewPlanarComplex(
+  Element *ptr,
+  Layout const &layout,
+  typename Layout::LongIndex imaginary_stride,
+  typename Layout::TensorCoord const &extent) {
+  return TensorViewPlanarComplex<Element, Layout>(ptr, layout, imaginary_stride, extent);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+}  // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/tfloat32.h ADDED Viewed

	@@ -0,0 +1,479 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+    \file
+    \brief Defines a proxy class for storing Tensor Float 32 data type.
+*/
+#pragma once
+#if defined(__CUDACC_RTC__)
+#include "cutlass/floating_point_nvrtc.h"
+#else
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#include <cstring> // std::memcpy
+#endif
+#include "cutlass/cutlass.h"
+namespace cutlass {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tensor Float 32 data type
+struct alignas(4) tfloat32_t {
+  //
+  // Data members
+  //
+  /// Storage type
+  uint32_t storage;
+  //
+  // Methods
+  //
+  private:
+    CUTLASS_HOST_DEVICE
+    static uint32_t float_to_storage(float s) {
+  #if defined(__CUDA_ARCH__)
+      uint32_t result = reinterpret_cast<uint32_t const &>(s);
+  #else
+      uint32_t result;
+      std::memcpy(&result, &s, sizeof(float));
+  #endif
+      return result;
+    }
+  public:
+  /// Constructs from an unsigned int
+  CUTLASS_HOST_DEVICE
+  static tfloat32_t bitcast(uint32_t x) {
+    tfloat32_t h;
+    h.storage = x;
+    return h;
+  }
+  /// Emulated rounding is fast in device code
+  CUTLASS_HOST_DEVICE
+  static tfloat32_t round_half_ulp_truncate(float const &s) {
+    uint32_t x = float_to_storage(s);
+    #if defined(__CUDA_ARCH__)
+    if (::isfinite(s)) {
+      x += 0x1000u;
+    }
+    #else
+    if (std::isfinite(s)) {
+      x += 0x1000u;
+    }
+    #endif
+    return tfloat32_t::bitcast(x);
+  }
+  tfloat32_t() = default;
+  /// Floating-point conversion - round toward nearest even
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(float x): storage(round_half_ulp_truncate(x).raw()) { }
+  // Conversion from double (this rounds twice)
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(double x): tfloat32_t(float(x)) { }
+  /// Integer conversion - round toward zero
+  CUTLASS_HOST_DEVICE
+  explicit tfloat32_t(int x) {
+    float flt = static_cast<float>(x);
+    #if defined(__CUDA_ARCH__)
+    storage = reinterpret_cast<uint32_t const &>(flt);
+    #else
+    std::memcpy(&storage, &flt, sizeof(storage));
+    #endif
+  }
+  // Conversion to float
+  CUTLASS_HOST_DEVICE
+  operator float() const {
+    // Conversions to IEEE single-precision requires clearing dont-care bits
+    // of the mantissa.
+    unsigned bits = (storage & ~0x1fffu);
+    #if defined(__CUDA_ARCH__)
+    return reinterpret_cast<float const &>(bits);
+    #else
+    float flt;
+    std::memcpy(&flt, &bits, sizeof(flt));
+    return flt;
+    #endif
+  }
+  /// Converts to double
+  CUTLASS_HOST_DEVICE
+  explicit operator double() const {
+    return double(float(*this));
+  }
+  /// Converts to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(float(*this));
+  }
+  /// Casts to bool
+  CUTLASS_HOST_DEVICE
+  explicit operator bool() const {
+    return (float(*this) != 0.0f);
+  }
+  /// Obtains raw bits
+  CUTLASS_HOST_DEVICE
+  uint32_t raw() const {
+    return storage;
+  }
+  /// Returns the sign bit
+  CUTLASS_HOST_DEVICE
+  bool signbit() const {
+    return ((raw() & 0x80000000) != 0);
+  }
+  /// Returns the biased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent_biased() const {
+    return int((raw() >> 23) & 0x0ff);
+  }
+  /// Returns the unbiased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent() const {
+    return exponent_biased() - 127;
+  }
+  /// Returns the mantissa
+  CUTLASS_HOST_DEVICE
+  int mantissa() const {
+    return int(raw() & 0x7fffff);
+  }
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////
+CUTLASS_HOST_DEVICE
+bool signbit(cutlass::tfloat32_t const& h) {
+  return h.signbit();
+}
+CUTLASS_HOST_DEVICE
+cutlass::tfloat32_t abs(cutlass::tfloat32_t const& h) {
+  return cutlass::tfloat32_t::bitcast(h.raw() & 0x7fffffff);
+}
+CUTLASS_HOST_DEVICE
+bool isnan(cutlass::tfloat32_t const& h) {
+  return (h.exponent_biased() == 0x0ff) && h.mantissa();
+}
+CUTLASS_HOST_DEVICE
+bool isfinite(cutlass::tfloat32_t const& h) {
+  return (h.exponent_biased() != 0x0ff);
+}
+CUTLASS_HOST_DEVICE
+cutlass::tfloat32_t nan_tf32(const char*) {
+  // NVIDIA canonical NaN
+  return cutlass::tfloat32_t::bitcast(0x7fffffff);
+}
+CUTLASS_HOST_DEVICE
+bool isinf(cutlass::tfloat32_t const& h) {
+  return (h.exponent_biased() == 0x0ff) && !h.mantissa();
+}
+CUTLASS_HOST_DEVICE
+bool isnormal(cutlass::tfloat32_t const& h) {
+  return h.exponent_biased() && h.exponent_biased() != 0x0ff;
+}
+CUTLASS_HOST_DEVICE
+int fpclassify(cutlass::tfloat32_t const& h) {
+  int exp = h.exponent_biased();
+  int mantissa = h.mantissa();
+  if (exp == 0x0ff) {
+    if (mantissa) {
+      return FP_NAN;
+    }
+    else {
+      return FP_INFINITE;
+    }
+  }
+  else if (!exp) {
+    if (mantissa) {
+      return FP_SUBNORMAL;
+    }
+    else {
+      return FP_ZERO;
+    }
+  }
+  return FP_NORMAL;
+}
+CUTLASS_HOST_DEVICE
+cutlass::tfloat32_t sqrt(cutlass::tfloat32_t const& h) {
+#if defined(__CUDACC_RTC__)
+  return cutlass::tfloat32_t(sqrtf(float(h)));
+#else
+  return cutlass::tfloat32_t(std::sqrt(float(h)));
+#endif
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t copysign(tfloat32_t const& a, tfloat32_t const& b) {
+  uint32_t a_mag = (a.raw() & 0x7fffffff);
+  uint32_t b_sign = (b.raw() & 0x80000000);
+  uint32_t result = (a_mag | b_sign);
+  return tfloat32_t::bitcast(result);
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace std {
+#if !defined(__CUDACC_RTC__)
+/// Numeric limits
+template <>
+struct numeric_limits<cutlass::tfloat32_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 19;
+  /// Least positive value
+  static cutlass::tfloat32_t min() { return cutlass::tfloat32_t::bitcast(0x01); }
+  /// Minimum finite value
+  static cutlass::tfloat32_t lowest() { return cutlass::tfloat32_t::bitcast(0xff7fffff); }
+  /// Maximum finite value
+  static cutlass::tfloat32_t max() { return cutlass::tfloat32_t::bitcast(0x7f7fffff); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t epsilon() { return cutlass::tfloat32_t::bitcast(0x1000); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t round_error() { return cutlass::tfloat32_t(0.5f); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t infinity() { return cutlass::tfloat32_t::bitcast(0x7f800000); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t quiet_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t signaling_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); }
+  /// Returns smallest finite value
+  static cutlass::tfloat32_t denorm_min() { return cutlass::tfloat32_t::bitcast(0x1); }
+};
+#endif
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace std
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+CUTLASS_HOST_DEVICE
+bool operator==(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) == float(rhs);
+}
+CUTLASS_HOST_DEVICE
+bool operator!=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) != float(rhs);
+}
+CUTLASS_HOST_DEVICE
+bool operator<(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) < float(rhs);
+}
+CUTLASS_HOST_DEVICE
+bool operator<=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) <= float(rhs);
+}
+CUTLASS_HOST_DEVICE
+bool operator>(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) > float(rhs);
+}
+CUTLASS_HOST_DEVICE
+bool operator>=(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return float(lhs) >= float(rhs);
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator+(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(float(lhs) + float(rhs));
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator-(tfloat32_t const& lhs) {
+  return tfloat32_t::bitcast(0x80000000 ^ lhs.raw());
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator-(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(float(lhs) - float(rhs));
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator*(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(float(lhs) * float(rhs));
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator/(tfloat32_t const& lhs, tfloat32_t const& rhs) {
+  return tfloat32_t(float(lhs) / float(rhs));
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator+=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(float(lhs) + float(rhs));
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator-=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(float(lhs) - float(rhs));
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator*=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(float(lhs) * float(rhs));
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator/=(tfloat32_t & lhs, tfloat32_t const& rhs) {
+  lhs = tfloat32_t(float(lhs) / float(rhs));
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator++(tfloat32_t & lhs) {
+  float tmp(lhs);
+  ++tmp;
+  lhs = tfloat32_t(tmp);
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t& operator--(tfloat32_t & lhs) {
+  float tmp(lhs);
+  --tmp;
+  lhs = tfloat32_t(tmp);
+  return lhs;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator++(tfloat32_t & lhs, int) {
+  tfloat32_t ret(lhs);
+  float tmp(lhs);
+  tmp++;
+  lhs = tfloat32_t(tmp);
+  return ret;
+}
+CUTLASS_HOST_DEVICE
+tfloat32_t operator--(tfloat32_t & lhs, int) {
+  tfloat32_t ret(lhs);
+  float tmp(lhs);
+  tmp--;
+  lhs = tfloat32_t(tmp);
+  return ret;
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// User-defined literals
+//
+CUTLASS_HOST_DEVICE
+cutlass::tfloat32_t operator "" _tf32(long double x) {
+  return cutlass::tfloat32_t(float(x));
+}
+CUTLASS_HOST_DEVICE
+cutlass::tfloat32_t operator "" _tf32(unsigned long long int x) {
+  return cutlass::tfloat32_t(int(x));
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/thread/matrix.h ADDED Viewed

	@@ -0,0 +1,198 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines a matrix object intended for storing data in registers and operations within
+      a CUDA thread.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/matrix_coord.h"
+namespace cutlass {
+namespace thread {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Per-thread matrix object storing a packed matrix
+template <
+  typename Element,
+  int Rows,
+  int Columns,
+  typename Layout = layout::RowMajor
+>
+class Matrix : public Array<Element, Rows * Columns> {
+public:
+  // Verify layout refers to a rank=2 matrix.
+  static_assert(
+    Layout::kRank == 2,
+    "Layout type must refer to a rank=2 matrix");
+  /// Base type
+  using Base = Array<Element, Rows * Columns>;
+  /// Element type
+  using Element = Element_;
+  /// Number of rows
+  static int const kRows = Rows;
+  /// Number of columns
+  static int const kColumns = Columns;
+  /// Layout within the array
+  using Layout = Layout_;
+  /// Reference type to an element
+  using Reference = Element &;
+  /// Logical rank of tensor index space
+  static int const kRank = 2;
+  /// Index type
+  using Index = typename Layout::Index;
+  /// Long index used for pointer offsets
+  using LongIndex = typename Layout::LongIndex;
+  /// Coordinate in logical tensor space
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Stride type
+  using Stride = typename Layout::Stride;
+  /// TensorRef to matrix object
+  using TensorRef = TensorRef<Element, kRank, Layout>;
+  /// TensorRef to constant matrix object
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+  /// TensorRef to matrix object
+  using TensorView = TensorView<Element, kRank, Layout>;
+  /// TensorRef to constant matrix object
+  using ConstTensorView = typename TensorView::ConstTensorView;
+  /// Diagonal vector
+  using Diagonal = Vector<Element, __NV_STD_MIN(kRows, kColumns)>;
+private:
+public:
+  //
+  // Methods
+  //
+  /// Returns the size of the object
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord extent() {
+    return make_Coord(kRows, kColumns);
+  }
+  /// Returns the layout object
+  CUTLASS_HOST_DEVICE
+  static Layout layout() {
+    return Layout::packed(extent());
+  }
+  /// Ctor
+  CUTLASS_HOST_DEVICE
+  Matrix() { }
+  /// Ctor
+  CUTLASS_HOST_DEVICE
+  Matrix(Diagonal const &diag) {
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  TensorRef ref() {
+    return TensorRef(this->data(), layout());
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  ConstTensorRef const_ref() const {
+    return ConstTensorRef(this->data(), layout());
+  }
+  /// Returns a TensorRef pointing to the first element of the tensor.
+  CUTLASS_HOST_DEVICE
+  TensorView view() {
+    return TensorView(ref(), extent());
+  }
+  /// Returns a TensorView to const data
+  CUTLASS_HOST_DEVICE
+  ConstTensorView const_view() const {
+    return ConstTensorView(const_ref(), extent());
+  }
+  /// Returns a reference to the element at a given Coord
+  CUTLASS_HOST_DEVICE
+  Reference at(MatrixCoord const& coord) const {
+    typename Base::size_type offset_(layout().offset(coord));
+    return Base::at(offset_);
+  }
+  /// Returns the number of scalar elements needed to store tensor.
+  CUTLASS_HOST_DEVICE
+  LongIndex capacity() const {
+    return LongIndex(Base::size());
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Column vector defined as a matrix with exactly one column
+template <
+  typename Element,
+  int Rows,
+  typename Layout = layout::ColumnMajor
+>
+using ColumnVector = Matrix<Element, Rows, 1, Layout>;
+/// Row vector defined as a matrix with exactly one row
+template <
+  typename Element,
+  int Columns,
+  typename Layout = layout::RowMajor
+>
+using RowVector = Matrix<Element, 1, Columns, Layout>;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace thread
+} // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/trace.h ADDED Viewed

	@@ -0,0 +1,59 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Helpers for optionally tracing through code when debugging.
+    This file is to be included after all other headers.
+*/
+#pragma once
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Tracing options
+#ifndef CUTLASS_DEBUG_TRACE_LEVEL
+#define CUTLASS_DEBUG_TRACE_LEVEL 0
+#endif
+#if CUTLASS_DEBUG_TRACE_LEVEL
+#include <iostream>
+#include "cutlass/core_io.h"
+#if defined(__CUDA_ARCH__)
+#define CUTLASS_TRACE_HOST(x)
+#else
+#define CUTLASS_TRACE_HOST(x) { std::cout << __FILE__ << ":" << __LINE__ << "  " << x << std::endl; }
+#endif
+#else
+#define CUTLASS_TRACE_HOST(x)
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp ADDED Viewed

	@@ -0,0 +1,754 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing how threads are mapped to a given tile.
+*/
+#pragma once
+#include "cute/arch/mma_sm90_gmma.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace collective {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+using namespace cute;
+template <bool Transpose, class SmemLayoutAtom, class ElementType>
+constexpr auto
+gmma_smem_transpose_or_passthrough() {
+  if constexpr (Transpose) {
+    if constexpr (cute::is_same_v<GMMA::Layout_MN_SW128_Atom<ElementType>, SmemLayoutAtom>) {
+      return GMMA::Layout_K_SW128_Atom<ElementType>{};
+    }
+    else if constexpr (cute::is_same_v<GMMA::Layout_MN_SW64_Atom<ElementType>, SmemLayoutAtom>) {
+      return GMMA::Layout_K_SW64_Atom<ElementType>{};
+    }
+    else if constexpr (cute::is_same_v<GMMA::Layout_MN_SW32_Atom<ElementType>, SmemLayoutAtom>) {
+      return GMMA::Layout_K_SW32_Atom<ElementType>{};
+    }
+    else if constexpr (cute::is_same_v<GMMA::Layout_MN_INTER_Atom<ElementType>, SmemLayoutAtom>) {
+      return GMMA::Layout_K_INTER_Atom<ElementType>{};
+    }
+    else {
+      static_assert(cutlass::detail::dependent_false<SmemLayoutAtom>, "Unsupported Layout_SW_Atom for B SMEM transposition");
+    }
+  }
+  else {
+    return SmemLayoutAtom{};
+  }
+}
+template <class SmemCopyAtom, class ElementType>
+constexpr auto
+use_universal_transposition() {
+  if constexpr (sizeof(ElementType) == 1) {
+    return !cute::is_same_v<GMMA::Layout_MN_SW128_Atom<ElementType>, SmemCopyAtom>;
+  }
+  else if constexpr (sizeof(ElementType) == 4){
+    // Only universal transposition can handle SW64 and Non swizzle SMEM layout
+    if constexpr (cute::is_same_v<GMMA::Layout_MN_SW64_Atom<ElementType>, SmemCopyAtom> ||
+                  cute::is_same_v<GMMA::Layout_MN_INTER_Atom<ElementType>, SmemCopyAtom>) {
+      return true;
+    }
+    else {
+      return false;
+    }
+  }
+  else {
+    static_assert(cutlass::detail::dependent_false<ElementType>, "Unsupported ElementType for B SMEM transposition");
+  }
+}
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class NoTranspositionOperandB {
+public:
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  constexpr CUTLASS_HOST_DEVICE
+  NoTranspositionOperandB(
+      int,
+      int,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB) { }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+    TensorSmemB const&,
+    TensorTransposedSmemB const&,
+    int, int) { }
+  CUTLASS_DEVICE void synchronize(int) { }
+  CUTLASS_DEVICE void synchronize() { }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const&,
+    TensorTransposedSmemB const&,
+    int) { }
+};
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class UniversalTranspositionOperandB {
+public:
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  constexpr CUTLASS_HOST_DEVICE
+  UniversalTranspositionOperandB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_) { }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage, int current_step) {
+      if (current_step > 0) {
+        return;
+      }
+      constexpr int NumMathWarpGroup = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+      static_assert(NumMathWarpGroup == 1 ||
+                    (!detail::use_universal_transposition<SmemLayoutAtomB, ElementB>() && NumMathWarpGroup == 2),
+                    "Wrong math warp group number for TransposeB");
+      constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+      constexpr int BytesPerSmemSwizzleUnit = 16;
+      constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// Universal transposition, need warp_group sync between load and store.
+      /// The number of reg used depends on the input elementB.
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /*
+          In one copy step, a warp group would load WarpgroupTileSize * WarpgroupTileSize tile then store to transposed location.
+          In warp_group_tile, each warp holds Four WarpTileSize x WarpTileSize elements:
+                    K
+              ------------
+            | W0 W1 W2 W3  ---
+            | W0 W1 W2 W3    |
+            | W0 W1 W2 W3    | --> Copy Step 0
+            | W0 W1 W2 W3  ---
+                  ....
+            | W0 W1 W2 W3  ---
+            | W0 W1 W2 W3    |
+            | W0 W1 W2 W3    | --> Copy Step n
+            | W0 W1 W2 W3  ---
+      */
+      static_assert((NumThreadsPerWarpGroup % WarpThreadShapeN == 0), "Unsupported warp thread layout.");
+      constexpr auto WarpgroupThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<NumThreadsPerWarpGroup / WarpThreadShapeN>{}));
+      // Get copy tile and partition to each thread
+      auto sB_tiled_copy = make_tiled_copy(
+        Copy_Atom<DefaultCopy, ElementB>{},
+        WarpgroupThreadLayout,                           // thr_layout
+        Layout<_1>{}                                     // val_layout
+      );
+      static_assert(size(sB_tiled_copy) == size(TiledMma{}), "Wrong thread number in TiledCopy.");
+      auto sB_thr_copy        = sB_tiled_copy.get_thread_slice(warp_group_thread_idx);
+      Tensor tCsB             = sB_thr_copy.partition_S(     sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
+      Tensor tCsB_transposed  = sB_thr_copy.partition_D(gmma_sB(_,_,read_stage)); // (CPY, CPY_N, CPY_K)
+      // Divide partitioned tile to limit register usage
+      constexpr int  CopySteps      = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+      constexpr auto CopyTileShape  = make_shape(size<0>(tCsB), Int< size<1>(tCsB) / CopySteps >{}, size<2>(tCsB));
+      static_assert(size<1>(tCsB) % CopySteps == 0, "CopySteps must evenly divide rank 1 size of partitioned SMEM.");
+      Tensor tCsB_copy_tile            = zipped_divide(tCsB, CopyTileShape);
+      Tensor tCsB_copy_tile_transposed = zipped_divide(tCsB_transposed, CopyTileShape);
+      auto   transpose_fragment        = make_fragment_like(tCsB_copy_tile(_,_0{}));
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int step = 0; step < CopySteps; ++step) {
+        copy(sB_tiled_copy, tCsB_copy_tile(_,step), transpose_fragment);
+        // Make sure all elements are read before being overwritten
+        __syncthreads();
+        copy(sB_tiled_copy, transpose_fragment, tCsB_copy_tile_transposed(_,step));
+      }
+  }
+  CUTLASS_DEVICE void synchronize(int step) {
+    if (step == 0) {
+      // SMEM fence to make sure B is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+    }
+  }
+  CUTLASS_DEVICE void synchronize() {
+    // SMEM fence to make sure B is transposed before math
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+    this->operator()(sB, gmma_sB, read_stage, 0);
+    synchronize();
+  }
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+};
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class AsyncTranspositionOperandB {
+public:
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  static constexpr int Steps             = 2;
+  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
+  static_assert(NumMathWarpGroup <= 2,
+                    "Wrong math warp group number for TransposeB");
+  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
+  static constexpr int BytesPerSmemSwizzleUnit = 16;
+  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
+  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
+  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
+  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invalid warp thread shape." );
+  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
+  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;
+  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;
+  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
+  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
+  static constexpr int NumBitsPerStep        = 3;
+  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
+  static constexpr int NumBitsPerWarp        = 12;
+  // Number of warp_group_tiles
+  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    "Copy size must evenly divide SMEM tile.");
+  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+  static_assert(size<2>(typename TiledMma::AtomShape_MNK{}) <= WarpThreadShapeK,
+      "Need to be able to transpose first k-block in the first step");
+  constexpr CUTLASS_HOST_DEVICE
+  AsyncTranspositionOperandB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_)
+      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
+      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
+      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+      TensorSmemB const& sB,
+      TensorTransposedSmemB const& gmma_sB,
+      int read_stage, int current_step)
+  {
+      if (current_step >= StepsPerWarpGroup) {
+        return;
+      }
+      static constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      /// A warp group uses 2 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
+      /// In each step, one warp would hold two warp_tiles.
+      ///  Step 0:                Step 1:
+      ///  W0 W1 W2 W3            -- -- -- --
+      ///  W1 W0 -- --            -- -- W3 W2
+      ///  W2 -- -- --            -- W3 W0 W1
+      ///  W3 -- -- --            -- W2 W1 W0
+      ///
+      /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///
+      /// Fully static coord LUT to avoid extra register use.
+      /// [warp_id][step][warp_tile][n / k]
+      /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
+      /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
+      /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
+      /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
+      /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
+      ///
+      /// Encoding the coord of warp tile0 into two int64_t values.
+      /// Only encoding Step 0 ~ Step 4, since Step 5 ~ Step 7 have a straightforward pattern.
+      /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
+      /// The 2-step transposition and the 8-step transposition share the same encoding.
+      ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // Divide entire SMEM to multiple warp_tiles
+      constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
+      Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
+      Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
+      // Get copy tile
+      auto sB_tiled_copy = make_tiled_copy(
+        Copy_Atom<DefaultCopy, ElementB>{},
+        WarpThreadLayout,     // thr_layout
+        Layout<_1>{}          // val_layout
+      );
+      static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
+      auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
+      // Construct fragments for transposition
+      Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
+      decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
+        make_fragment_like(tmp_tCsB),
+        make_fragment_like(tmp_tCsB)
+      };
+      [[maybe_unused]] int step = current_step * NumMathWarpGroup;
+      if constexpr (NumMathWarpGroup == 2) {
+        // For 2 math warpgroup, warp idx4~7 is 1st warp group and 8~9 is 2nd, so decide if 2nd warpgroup need warp idx divide 8.
+        step += warp_idx / (NumWarpsPerWarpGroup * 2);
+      }
+      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT >> (NumBitsPerStep * current_step);
+      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT >> (NumBitsPerStep * current_step);
+      if constexpr (NumMathWarpGroup == 2) {
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+      }
+      // decoding the warp tile coord.
+      int warp_tile0_n, warp_tile0_k;
+      if constexpr (StepsPerWarpGroup <= NumStepsEncoded) {
+        warp_tile0_n = tmp_warp_tile_n_coord_LUT & MaskPerStep;
+        warp_tile0_k = tmp_warp_tile_k_coord_LUT & MaskPerStep;
+      } else {
+        warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
+        warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
+      }
+      int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
+      int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
+        static_assert(TilesPerWarp == 2);
+        // [warp_tile][n/k]
+        const int warp_tile_coord[TilesPerWarp][2] = {
+          // n                                                           k
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
+        };
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB = sB_thr_copy.partition_S(
+            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
+        }
+        // Make sure elements in two 8x8 warp tiles are all consumed
+        __syncwarp();
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB_transposed = sB_thr_copy.partition_D(
+            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
+        }
+      } // loop warp_group_tile
+  }
+  CUTLASS_DEVICE void synchronize(int step) {
+    if (step < StepsPerWarpGroup) {
+      // SMEM fence to make sure B is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+    }
+  }
+  CUTLASS_DEVICE void synchronize() {
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < StepsPerWarpGroup; ++i) {
+      this->operator()(sB, gmma_sB, read_stage, i);
+    }
+    synchronize();
+  }
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+  const int warp_idx_in_warp_group;
+  const int current_warp_tile_n_coord_LUT;
+  const int current_warp_tile_k_coord_LUT;
+};
+template<
+  class TiledMma_,
+  class SmemLayoutB_,
+  class SmemLayoutAtomB_,
+  class ElementB_>
+class AsyncTranspositionOperandB_1BElementB {
+public:
+  static_assert(sizeof(ElementB_) == 1);
+  using TiledMma = TiledMma_;
+  using SmemLayoutB = SmemLayoutB_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using ElementB = ElementB_;
+  static constexpr int Steps             = 8;
+  static constexpr int NumMathWarpGroup  = CUTE_STATIC_V(size(TiledMma{})) / NumThreadsPerWarpGroup;
+  static constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
+  static_assert(NumMathWarpGroup <= 2,
+                    "Wrong math warp group number for TransposeB");
+  static constexpr int WarpgroupTileSize = size<1>(SmemLayoutB{});  // A warp group tile would process entire Smem K.
+  static constexpr int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp;
+  static constexpr int BytesPerSmemSwizzleUnit = 16;
+  static constexpr int WarpThreadShapeN = BytesPerSmemSwizzleUnit / sizeof(ElementB);
+  static constexpr int WarpThreadShapeK = NumThreadsPerWarp / WarpThreadShapeN;
+  static constexpr int NumWarpTilePerWarpgroupTile = NumWarpsPerWarpGroup * (Steps == 8 ? 2 : 1);
+  static constexpr int WarpTileSize                = WarpgroupTileSize / NumWarpTilePerWarpgroupTile;
+  static_assert(WarpTileSize >= WarpThreadShapeN && WarpTileSize >= WarpThreadShapeK, "Invalid warp thread shape." );
+  static constexpr int TilesPerWarp                = 2;                     // Each Warp would process 2 warp_tiles in one step.
+  static constexpr int64_t WarpTileNCoordLUT = 06723763275316420;
+  static constexpr int64_t WarpTileKCoordLUT = 05410541064206420;
+  static constexpr int NumStepsEncoded       = 4;                             // Only encoding first 4 steps into LUT.
+  static constexpr int MaskPerStep           = 07;                            // Each step is encoded into 3bits,
+  static constexpr int NumBitsPerStep        = 3;
+  static constexpr int MaskPerWarp           = 07777;                         // Each warp has 4 steps(12 bits)
+  static constexpr int NumBitsPerWarp        = 12;
+  // Number of warp_group_tiles
+  static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    "Copy size must evenly divide SMEM tile.");
+  static constexpr int WarpgroupTileNum = size<0>(SmemLayoutB{}) / WarpgroupTileSize;
+  constexpr CUTLASS_HOST_DEVICE
+  AsyncTranspositionOperandB_1BElementB(
+      int warp_idx_,
+      int warp_group_thread_idx_,
+      TiledMma,
+      SmemLayoutB,
+      SmemLayoutAtomB,
+      ElementB)
+      : warp_idx(warp_idx_)
+      , warp_group_thread_idx(warp_group_thread_idx_)
+      , warp_idx_in_warp_group(warp_idx_ % NumWarpsPerWarpGroup)
+      , current_warp_tile_n_coord_LUT((WarpTileNCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp)
+      , current_warp_tile_k_coord_LUT((WarpTileKCoordLUT >> ((warp_idx_
+            % NumWarpsPerWarpGroup) * NumBitsPerWarp)) & MaskPerWarp) { }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void operator()(
+      TensorSmemB const& sB,
+      TensorTransposedSmemB const& gmma_sB,
+      int read_stage, int current_step)
+  {
+    if (current_step > 0) {
+      return;
+    }
+    constexpr auto WarpThreadLayout           = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    /// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
+    ///  Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
+    ///  Step 0:                   Step 1:                   Step 2:                   Step 3:
+    ///  W0 W1 W2 W3 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W1 W0 -- -- -- -- -- --   -- -- W3 W2 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W2 -- -- -- -- -- -- --   -- W3 W0 W1 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  W3 -- -- -- -- -- -- --   -- W2 W1 W0 -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W1 W0 -- --   -- -- -- -- -- -- W3 W2
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W2 -- -- --   -- -- -- -- -- W3 W0 W1
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W3 -- -- --   -- -- -- -- -- W2 W1 W0
+    ///
+    ///  Step 4:                   Step 5:                   Step 6:                   Step 7:
+    ///  -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3   -- -- -- -- -- -- -- --
+    ///  -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- -- -- -- --   -- -- -- -- W0 W1 W2 W3
+    ///  W0 -- -- -- -- -- -- --   -- W0 -- -- -- -- -- --   -- -- W0 -- -- -- -- --   -- -- -- W0 -- -- -- --
+    ///  W1 -- -- -- -- -- -- --   -- W1 -- -- -- -- -- --   -- -- W1 -- -- -- -- --   -- -- -- W1 -- -- -- --
+    ///  W2 -- -- -- -- -- -- --   -- W2 -- -- -- -- -- --   -- -- W2 -- -- -- -- --   -- -- -- W2 -- -- -- --
+    ///  W3 -- -- -- -- -- -- --   -- W3 -- -- -- -- -- --   -- -- W3 -- -- -- -- --   -- -- -- W3 -- -- -- --
+    ///
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///
+    /// Fully static coord LUT to avoid extra register use.
+    /// [warp_id][step][warp_tile][n / k]
+    /// Step 0            Step 1         Step 2          Step 3          Step 4          Step 5         Step 6           Step 7
+    /// {{{0,0}, {1,1}}, {{2,2}, {3,3}}, {{4,4}, {5,5}}, {{6,6}, {7,7}}, {{4,0}, {0,4}}, {{4,1}, {1,4}}, {{4,2}, {2,4}}, {{4,3}, {3,4}}}, // W0
+    /// {{{1,0}, {0,1}}, {{3,2}, {2,3}}, {{5,4}, {4,5}}, {{7,6}, {6,7}}, {{5,0}, {0,5}}, {{5,1}, {1,5}}, {{5,2}, {2,5}}, {{5,3}, {3,5}}}, // W1
+    /// {{{2,0}, {0,2}}, {{3,1}, {1,3}}, {{6,4}, {4,6}}, {{7,5}, {5,7}}, {{6,0}, {0,6}}, {{6,1}, {1,6}}, {{6,2}, {2,6}}, {{6,3}, {3,6}}}, // W2
+    /// {{{3,0}, {0,3}}, {{2,1}, {1,2}}, {{7,4}, {4,7}}, {{6,5}, {5,6}}, {{7,0}, {0,7}}, {{7,1}, {1,7}}, {{7,2}, {2,7}}, {{7,3}, {3,7}}}, // W3
+    ///
+    /// Encoding the coord of warp tile0 into two int64_t values.
+    /// Only encoding Step 0 ~ Step 4, since Step 5 ~ Step 7 have a straightforward pattern.
+    /// Only encoding warp tile0, since the coords of warp tile1 could be easily deduced from warp tile0.
+    /// The 2-step transposition and the 8-step transposition share the same encoding.
+    ///
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Divide entire SMEM to multiple warp_tiles
+    constexpr auto WarpTileShape = make_shape(Int<WarpTileSize>(), Int<WarpTileSize>());
+    Tensor s_tile                = zipped_divide(     sB(_,_,read_stage), WarpTileShape);
+    Tensor s_tile_transposed     = zipped_divide(gmma_sB(_,_,read_stage), WarpTileShape);
+    // Get copy tile
+    auto sB_tiled_copy = make_tiled_copy(
+      Copy_Atom<DefaultCopy, ElementB>{},
+      WarpThreadLayout,     // thr_layout
+      Layout<_1>{}          // val_layout
+    );
+    static_assert(size(sB_tiled_copy) * NumWarpsPerWarpGroup == size(TiledMma{}) / NumMathWarpGroup, "Wrong thread number in TiledCopy.");
+    auto sB_thr_copy = sB_tiled_copy.get_thread_slice(warp_group_thread_idx % NumThreadsPerWarp);  // slice based on lane_idx
+    // Construct fragments for transposition
+    Tensor tmp_tCsB = sB_thr_copy.partition_S(flatten(s_tile(_, make_coord(_0{}, _0{}))));
+    decltype(make_fragment_like(tmp_tCsB)) transpose_fragments[TilesPerWarp] = {
+      make_fragment_like(tmp_tCsB),
+      make_fragment_like(tmp_tCsB)
+    };
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int warp_group_tile = 0; warp_group_tile < WarpgroupTileNum; ++warp_group_tile) {
+      int tmp_warp_tile_n_coord_LUT = current_warp_tile_n_coord_LUT;
+      int tmp_warp_tile_k_coord_LUT = current_warp_tile_k_coord_LUT;
+      constexpr int StepsPerWarpGroup = Steps / NumMathWarpGroup;
+      if constexpr (NumMathWarpGroup == 2) {
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep * (warp_idx / (NumWarpsPerWarpGroup * 2));
+      }
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int step_per_warp_group = 0; step_per_warp_group < StepsPerWarpGroup; ++step_per_warp_group) {
+        // For 2 math warpgroup, warp idx4~7 is 1st warp group and 8~9 is 2nd, so decide if 2nd warpgroup need warp idx divide 8.
+        int step = step_per_warp_group * NumMathWarpGroup + warp_idx / (NumWarpsPerWarpGroup * 2);
+        // decoding the warp tile coord.
+        int warp_tile0_n = step < NumStepsEncoded ? (tmp_warp_tile_n_coord_LUT & MaskPerStep) : 4 + warp_idx_in_warp_group;
+        int warp_tile0_k = step < NumStepsEncoded ? (tmp_warp_tile_k_coord_LUT & MaskPerStep) : step - 4;
+        int warp_tile1_n = warp_tile0_n == warp_tile0_k ? warp_tile0_n + 1 : warp_tile0_k;
+        int warp_tile1_k = warp_tile0_n == warp_tile0_k ? warp_tile0_k + 1 : warp_tile0_n;
+        tmp_warp_tile_n_coord_LUT >>= NumBitsPerStep;
+        tmp_warp_tile_k_coord_LUT >>= NumBitsPerStep;
+        static_assert(TilesPerWarp == 2);
+        // [warp_tile][n/k]
+        const int warp_tile_coord[TilesPerWarp][2] = {
+          // n                                                           k
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile0_n, warp_tile0_k}, // warp_tile 0
+          {warp_group_tile * NumWarpTilePerWarpgroupTile + warp_tile1_n, warp_tile1_k}  // warp_tile 1
+        };
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB = sB_thr_copy.partition_S(
+            flatten(s_tile(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, tCsB, transpose_fragments[warp_tile]);
+        }
+        // Make sure elements in two 8x8 warp tiles are all consumed
+        __syncwarp();
+        CUTLASS_PRAGMA_UNROLL
+        for (int warp_tile = 0; warp_tile < TilesPerWarp; ++warp_tile) {
+          Tensor tCsB_transposed = sB_thr_copy.partition_D(
+            flatten(s_tile_transposed(_, make_coord(warp_tile_coord[warp_tile][0], warp_tile_coord[warp_tile][1])))
+          ); // (CPY, CPY_N, CPY_K)
+          copy(sB_tiled_copy, transpose_fragments[warp_tile], tCsB_transposed);
+        }
+      } // lock step
+    } // loop warp_group_tile
+  }
+  CUTLASS_DEVICE void synchronize(int step) {
+    if (step == 0) {
+      // SMEM fence to make sure B is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+    }
+  }
+  CUTLASS_DEVICE void synchronize() {
+    cutlass::arch::fence_view_async_shared();
+    cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::TransposeBarrier);
+  }
+  template <
+    class TensorSmemB,
+    class TensorTransposedSmemB>
+  CUTLASS_DEVICE void transpose(
+    TensorSmemB const& sB,
+    TensorTransposedSmemB const& gmma_sB,
+    int read_stage) {
+    this->operator()(sB, gmma_sB, read_stage, 0);
+    synchronize();
+  }
+private:
+  const int warp_idx;
+  const int warp_group_thread_idx;
+  const int warp_idx_in_warp_group;
+  const int current_warp_tile_n_coord_LUT;
+  const int current_warp_tile_k_coord_LUT;
+};
+template<
+  class TiledMma,
+  class SmemLayoutB,
+  class SmemLayoutAtomB,
+  class ElementB,
+  bool TransposeB
+>
+constexpr CUTLASS_HOST_DEVICE
+auto
+make_transpose_operand_b(
+    int warp_idx,
+    int warp_group_thread_idx,
+    TiledMma,
+    SmemLayoutB,
+    SmemLayoutAtomB,
+    ElementB,
+    cute::bool_constant<TransposeB>)
+{
+  if constexpr (!TransposeB) {
+    return NoTranspositionOperandB(
+        warp_idx, warp_group_thread_idx, TiledMma{},
+        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+  }
+  else if constexpr (use_universal_transposition<SmemLayoutAtomB, ElementB>()) {
+    return UniversalTranspositionOperandB(
+        warp_idx, warp_group_thread_idx, TiledMma{},
+        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+  }
+  else if constexpr (sizeof(ElementB) == 1) {
+    return AsyncTranspositionOperandB_1BElementB(
+        warp_idx, warp_group_thread_idx, TiledMma{},
+        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+  }
+  else {
+    return AsyncTranspositionOperandB(
+        warp_idx, warp_group_thread_idx, TiledMma{},
+        SmemLayoutB{}, SmemLayoutAtomB{}, ElementB{});
+  }
+}
+}; // namespace detail
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace collective
+} // namespace transform
+} // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/device/transform_universal_adapter.hpp ADDED Viewed

	@@ -0,0 +1,303 @@

+/***************************************************************************************************
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Transform Kernel Universal adapter
+*/
+#pragma once
+// common
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/detail/mma.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+#include "cutlass/kernel_launch.h"
+#if !defined(__CUDACC_RTC__)
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/trace.h"
+#endif // !defined(__CUDACC_RTC__)
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass::transform::device {
+////////////////////////////////////////////////////////////////////////////////
+template <class TransformKernel_>
+class TransformUniversalAdapter
+{
+public:
+  using TransformKernel = GetUnderlyingKernel_t<TransformKernel_>;
+  using Arguments = typename TransformKernel::Arguments;
+  using Params = typename TransformKernel::Params;
+  static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER;
+private:
+  /// Kernel API parameters object
+  Params params_;
+public:
+  /// Access the Params structure
+  Params const& params() const {
+    return params_;
+  }
+  /// Determines whether the GEMM can execute the given problem.
+  static Status
+  can_implement(Arguments const& args) {
+    return TransformKernel::can_implement(args);
+  }
+  /// Gets the workspace size
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t workspace_bytes = 0;
+    workspace_bytes += TransformKernel::get_workspace_size(args);
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+    return workspace_bytes;
+  }
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Arguments const& args, void* workspace = nullptr) {
+    auto tmp_params = TransformKernel::to_underlying_arguments(args, workspace);
+    return TransformKernel::get_grid_shape(tmp_params);
+  }
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(Params const& params) {
+    return TransformKernel::get_grid_shape(params);
+  }
+  /// Initializes GEMM state from arguments.
+  Status
+  initialize(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    CUTLASS_TRACE_HOST("TransformUniversalAdapter::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null")
+      << ", EnableCudaHostAdapter: " << (kEnableCudaHostAdapter ? "True" : "false"));
+    // Initialize the workspace
+    Status status = TransformKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    // Initialize the Params structure
+    params_ = TransformKernel::to_underlying_arguments(args, workspace);
+    // Don't set the function attributes - require the CudaHostAdapter to set it.
+    if constexpr (kEnableCudaHostAdapter) {
+      CUTLASS_ASSERT(cuda_adapter);
+      return Status::kSuccess;
+    }
+    else {
+      //
+      // Account for dynamic smem capacity if needed
+      //
+      int smem_size = TransformKernel::SharedStorageSize;
+      CUTLASS_ASSERT(cuda_adapter == nullptr);
+      if (smem_size >= (48 << 10)) {
+        CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+        cudaError_t result = cudaFuncSetAttribute(
+            device_kernel<TransformKernel>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize,
+            smem_size);
+        if (cudaSuccess != result) {
+          result = cudaGetLastError(); // to clear the error bit
+          CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    return Status::kSuccess;
+  }
+  static Status
+  run(Params& params,
+      cudaStream_t stream = nullptr,
+      CudaHostAdapter *cuda_adapter = nullptr,
+      int32_t kernel_index = 0,
+      bool launch_with_pdl = false) {
+    CUTLASS_TRACE_HOST("TransformUniversalAdapter::run()");
+    dim3 const block = TransformKernel::get_block_shape();
+    dim3 const grid = get_grid_shape(params);
+    // configure smem size and carveout
+    int smem_size = TransformKernel::SharedStorageSize;
+    Status launch_result{ Status::kSuccess };
+    // Use extended launch API only for mainloops that use it
+    if constexpr (TransformKernel::ArchTag::kMinComputeCapability >= 90) {
+      // Currently only support 1x1x1 for transform kernel.
+      dim3 const cluster = {1,1,1};
+      void* kernel_params[] = {&params};
+      if constexpr (kEnableCudaHostAdapter) {
+        //
+        // Use the cuda host adapter
+        //
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          if (launch_with_pdl) {
+            CUTLASS_TRACE_HOST(
+              "TransformUniversalAdapter::run() does not support launching with PDL and a custom cuda adapter.");
+            return Status::kErrorInternal;
+          }
+          launch_result = cuda_adapter->launch(grid,
+                                               cluster,
+                                               block,
+                                               smem_size,
+                                               stream,
+                                               kernel_params,
+                                               kernel_index);
+          CUTLASS_TRACE_HOST("Kernel Launch Result" << cutlassGetStatusString(launch_result));
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        void const* kernel = (void const*) device_kernel<TransformKernel>;
+        if constexpr (TransformKernel::ArchTag::kMinComputeCapability == 90) {
+          launch_result = ClusterLauncher::launch(
+            grid, cluster, block, smem_size, stream, kernel, kernel_params, launch_with_pdl);
+        }
+      }
+    }
+    else {
+      launch_result = Status::kSuccess;
+      cutlass::arch::synclog_setup();
+      if constexpr (kEnableCudaHostAdapter) {
+        CUTLASS_ASSERT(cuda_adapter);
+        if (cuda_adapter) {
+          void* kernel_params[] = {&params};
+          launch_result = cuda_adapter->launch(
+            grid, block, smem_size, stream, kernel_params, 0
+          );
+        }
+        else {
+          return Status::kErrorInternal;
+        }
+      }
+      else {
+        CUTLASS_ASSERT(cuda_adapter == nullptr);
+        cutlass::kernel_launch<TransformKernel>(grid, block, smem_size, stream, params, launch_with_pdl);
+      }
+    }
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess == result && Status::kSuccess == launch_result) {
+      return Status::kSuccess;
+    }
+    else if (cudaSuccess != result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cudaGetErrorString(result));
+    }
+    else if (Status::kSuccess != launch_result) {
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << cutlassGetStatusString(launch_result));
+    }
+    return Status::kErrorInternal;
+  }
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  run(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    int32_t kernel_index = 0,
+    bool launch_with_pdl = false
+  ) {
+    Status status = initialize(args, workspace, stream, cuda_adapter);
+    if (Status::kSuccess == status) {
+      status = run(params_, stream, cuda_adapter, kernel_index, launch_with_pdl);
+    }
+    return status;
+  }
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  operator()(
+    Arguments const& args,
+    void* workspace = nullptr,
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    return run(args, workspace, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
+  }
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  run(
+    cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr,
+    bool launch_with_pdl = false) {
+    return run(params_, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
+  }
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, bool launch_with_pdl = false) {
+    return run(params_, stream, cuda_adapter, 0 /*kernel_index*/, launch_with_pdl);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+} // namespace cutlass::transform::device
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/filter_format_transformer.hpp ADDED Viewed

	@@ -0,0 +1,223 @@

+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Convolution filter format transformation kernel.
+*/
+#pragma once
+#include <algorithm>
+#include <random>
+#include "cutlass/coord.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/cuda_host_adapter.hpp"
+#include "cute/int_tuple.hpp"
+#include "cute/tensor.hpp"
+#include "cute/config.hpp"
+namespace cutlass::transform::kernel {
+using namespace cute;
+enum class FilterFormat {
+  CKTRS,
+  CTRSK,
+  KTRSC
+};
+template <
+  FilterFormat SrcFormat,
+  FilterFormat DstFormat,
+  int NumDimensions,
+  class Element_,
+  int AlignmentBytes = 16
+>
+struct ConvFilterFormatTransformer {
+  using Element = Element_;
+  static_assert(SrcFormat == FilterFormat::CKTRS, "Currently only source format of CKTRS is supported");
+  static_assert(DstFormat == FilterFormat::CTRSK || DstFormat == FilterFormat::KTRSC, "Currently only destination format of CTRSK/KTRSC is supported");
+  static_assert(AlignmentBytes > 0 && AlignmentBytes % static_cast<int>(sizeof(Element)) == 0, "Invalid alignment setting");
+  // In ktrsc order.
+  using FilterExtent = array<int, NumDimensions>;
+  // Default cta tile shape: 32x32
+  static constexpr auto CTATileShape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
+  // Default thread layout: (4, 32)
+  static constexpr auto ThreadLayout = make_layout(make_shape(Int<4>{}, Int<32>{}));
+  static constexpr uint32_t MaxThreadsPerBlock = 128;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+  using ArchTag = arch::Sm90;
+  // Default ctor
+  CUTLASS_HOST_DEVICE
+  ConvFilterFormatTransformer() {}
+  struct Arguments {
+    const void *src_ptr;
+    void *dst_ptr;
+    FilterExtent filter_extent;
+  };
+  struct Params {
+    using TensorSrc = decltype(make_tensor(make_gmem_ptr(recast_ptr<const Element>(nullptr)), make_layout(take<0,NumDimensions>(FilterExtent{}))));
+    using TensorDst = decltype(make_tensor(make_gmem_ptr(recast_ptr<Element>(nullptr)), make_layout(make_shape(int32_t(0), int32_t(0)))));
+    TensorSrc src;
+    TensorDst dst;
+  };
+  struct SharedStorage {
+    /* empty, no smem needed */
+  };
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+  static Status
+  can_implement(Arguments const& args) {
+    bool implementable = true;
+    // alignment rule
+    {
+      int contiguous_dim = DstFormat == FilterFormat::CTRSK ? args.filter_extent[0] : args.filter_extent[NumDimensions - 1];
+      int align_element = AlignmentBytes / static_cast<int>(sizeof(Element));
+      implementable &= (contiguous_dim % align_element == 0);
+      if (!implementable) {
+        CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Alignment setting is invalid.\n");
+        return Status::kInvalid;
+      }
+    }
+    return Status::kSuccess;
+  }
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    return 0;
+  }
+  static dim3
+  get_block_shape() {
+    return dim3(size(shape(ThreadLayout)), 1, 1);
+  }
+  static dim3
+  get_grid_shape(Params const& params) {
+    auto dim_m = ceil_div(size<0>(shape(params.dst)), get<0>(CTATileShape));
+    auto dim_n = ceil_div(size<1>(shape(params.dst)), get<1>(CTATileShape));
+    return dim3(dim_m, dim_n, 1);
+  }
+  static cutlass::Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    return Status::kSuccess;
+  }
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace) {
+    auto k = args.filter_extent[0];
+    auto c = args.filter_extent[NumDimensions - 1];
+    auto srt = reverse(take<1,NumDimensions - 1>(args.filter_extent));
+    // source shape (s,r,t,k,c)
+    auto shape_src = flatten(make_shape(srt, k, c));
+    auto shape_dst = DstFormat == FilterFormat::CTRSK ? make_shape(k, c * product(srt)) : make_shape(c, k * product(srt));
+    auto src = make_tensor(make_gmem_ptr(recast_ptr<const Element>(args.src_ptr)), make_layout(shape_src));
+    auto dst = make_tensor(make_gmem_ptr(recast_ptr<Element>(args.dst_ptr)), make_layout(shape_dst));
+    return Params{src, dst};
+  }
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char *smem_buf) {
+    // Tile the input tensor into blocks
+    auto block_coord = make_coord(blockIdx.x, blockIdx.y);
+    auto block_shape = make_shape(Int<4 * AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<32>{});
+    // Default thread layout: (4, 32)
+    auto thread_layout = make_layout(make_shape(Int<4>{}, Int<32>{}));
+    auto vec_layout = make_layout(make_shape(Int<AlignmentBytes / static_cast<int>(sizeof(Element))>{}, Int<1>{}));
+    Tensor tile_D = local_tile(params.dst, block_shape, block_coord);
+    // Construct tiled copy
+    using AccessType = cutlass::AlignedArray<Element, size(vec_layout)>;
+    using Atom = Copy_Atom<UniversalCopy<AccessType>, Element>;
+    auto tiled_copy = make_tiled_copy(Atom{}, thread_layout, vec_layout);
+    auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);
+    Tensor thr_tile_D = thr_copy.partition_D(tile_D);
+    // shape (s, r, t)
+    auto shape_trs = take<0, NumDimensions - 2>(shape(params.src));
+    // strided_c = c for format CTRSK, strided_c = k for format KTRSC
+    auto strided_c = DstFormat == FilterFormat::CTRSK ? get<NumDimensions - 1>(shape(params.src)) : get<NumDimensions - 2>(shape(params.src));
+    // shape (s, r, t, c) for format CTRSK and shape (s, r, t, k) for format KTRSC
+    auto shape_ctrs = append<NumDimensions - 1>(shape_trs, strided_c);
+    auto srtc_coord = idx2crd(int(blockIdx.y * get<1>(block_shape) + threadIdx.x / size<0>(thread_layout)), shape_ctrs);
+    // index of k for format CTRSK and index of c for format KTRSC
+    auto n_layout = make_layout(make_shape(gridDim.x, size<0>(thread_layout)), make_stride(size<0>(block_shape), size<0>(vec_layout)));
+    int n_idx = n_layout(make_coord(blockIdx.x, threadIdx.x % size<0>(thread_layout)));
+    // Fragment to load from S and store to D
+    auto frag = make_fragment_like(thr_tile_D);
+    // Predicate tensor.
+    Tensor thr_tile_P = make_tensor<bool>(shape(thr_tile_D));
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(frag); ++i) {
+      auto srt_coord = take<0, NumDimensions - 2>(srtc_coord);
+      auto kc_coord = DstFormat == FilterFormat::CTRSK ?
+          make_coord(n_idx+i, get<NumDimensions - 2>(srtc_coord)) :
+          make_coord(get<NumDimensions - 2>(srtc_coord), n_idx+i);
+      auto coord = flatten(make_coord(srt_coord, kc_coord));
+      thr_tile_P(i) = elem_less(coord, shape(params.src));
+      if (thr_tile_P(i)) {
+        frag(i) = params.src(coord);
+      }
+    }
+    // Copy from RMEM to GMEM
+    copy_if(tiled_copy, thr_tile_P, frag, thr_tile_D);
+  }
+};
+} // namespace cutlass::transform::kernel

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp ADDED Viewed

	@@ -0,0 +1,603 @@

+/***************************************************************************************************
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Compress utils specific for SM90 structure sparse kernels
+*/
+#pragma once
+#include "cute/container/bit_field.hpp"    // cute::bit_field
+#include "cute/numeric/numeric_types.hpp"  // cute::sizeof_bits_v, cute::uint_bit_t
+#include "cute/tensor.hpp"                 // cute::Tensor, cute::make_tensor
+#include "cute/algorithm/cooperative_copy.hpp" // cute::cooperative_copy
+#include "cutlass/arch/arch.h"             // cutlass::arch::Sm90
+#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
+#include "cutlass/cutlass.h"               // cutlass::Status
+#include "cutlass/gemm/gemm.h"             // cutlass::TagToStrideA_t
+#include "cutlass/fast_math.h"             // cutlass::ceil_div, cutlass::round_up
+#include "cutlass/kernel_hardware_info.h"  // cutlass::KernelHardwareInfo
+#include "cutlass/numeric_size.h"          // cutlass::bits_to_bytes
+#include "cutlass/numeric_types.h"         // cutlass::has_negative_zero_v
+#include "cutlass/cuda_host_adapter.hpp"   // cutlass::CudaHostAdapter
+namespace cutlass::transform::kernel {
+using namespace cute;
+template<
+  class ProblemShape_,
+  class ElementA_,
+  class LayoutATag_,
+  class SparseConfig_
+>
+class SM90StructuredSparseCompressor {
+public:
+  using SparseConfig = SparseConfig_;
+  using ProblemShape = ProblemShape_;
+  // * EltA
+  using ElementA = ElementA_;
+  using ElementAUint = cute::uint_bit_t<cute::sizeof_bits_v<ElementA>>;
+  using ElementAMma = typename SparseConfig::ElementAMma;
+  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
+  using ElementAMmaRawUnit = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;
+  using ElementASparsity = typename SparseConfig::ElementASparsity;
+  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
+  using ElementAUintCompressed = cute::sparse_elem<ElementASparsity{}, ElementAUint>;
+  using LayoutATag = LayoutATag_;
+  using LayoutA = LayoutATag;
+  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
+  // * EltE
+  using ElementEMma = typename SparseConfig::ElementEMma;
+  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
+  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
+  // Data Type for storing one chunk's metadata
+  static constexpr int ElementEBitsPerChunk = typename SparseConfig::ElementEBitsPerChunk{};
+  CUTE_STATIC_ASSERT(ElementEBitsPerChunk == 4, "ElementEBitsPerChunk is 4 for SM90");
+  using ElementEChunk = cute::uint_bit_t<ElementEBitsPerChunk>;
+  CUTE_STATIC_ASSERT(cute::is_same_v<ElementEChunk, cute::uint4_t>, "ElementEChunk is uint4_t for SM90");
+  using ElementESparsityPerChunk = Int<ElementEMmaSparsity{} / (cute::sizeof_bits_v<ElementEMmaRaw> / ElementEBitsPerChunk)>;
+  // AtomE
+  using TensorEAtom = typename SparseConfig::TensorEAtom;
+  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
+  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
+  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
+  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
+  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
+  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  // * Alignment
+  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
+  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
+  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
+  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
+  // Required by `device_kernel`
+  static constexpr int MaxThreadsPerBlock = TensorEAtomM{};
+  static constexpr int MinBlocksPerMultiprocessor = 1;
+  using ArchTag = arch::Sm90;
+  struct SharedStorage {
+    ElementEMma cEsE[cute::size(TensorEAtom{})];
+    ElementAUintCompressed cACsAC[cute::size(TensorEAtom{})];
+    ElementAUint cAsA[cute::size(TensorEAtom{})];
+  };
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+  struct TransformArguments {
+    void const* ptr_A{nullptr};
+    StrideA dA{};
+    void* ptr_ACompress{nullptr};
+    void* ptr_E{nullptr};
+  };
+  using TransformParams = TransformArguments;
+  struct Arguments {
+    ProblemShape problem_shape{};
+    TransformArguments transform{};
+    KernelHardwareInfo hw_info{};
+  };
+  struct Params {
+    ProblemShape problem_shape{};
+    TransformParams transform{};
+    KernelHardwareInfo hw_info{};
+    void* workspace = nullptr;
+  };
+public:
+  static Params
+  to_underlying_arguments(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::to_underlying_arguments()");
+    return Params{{args.problem_shape},
+                  {args.transform.ptr_A, args.transform.dA, args.transform.ptr_ACompress, args.transform.ptr_E},
+                  {args.hw_info},
+                  workspace};
+  }
+  static Status
+  can_implement(Arguments const& args) {
+    auto [M, N, K, L] = args.problem_shape;
+    if (K % LogicalElemsAPerChunk != 0) {
+      CUTLASS_TRACE_HOST("SM90 Sparse Compressor CAN NOT IMPLEMENT: GemmK not multiplier of logical chunk size");
+      return Status::kErrorInvalidProblem;
+    }
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::can_implement() (True)");
+    return Status::kSuccess;
+  }
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    CUTLASS_UNUSED(args);
+    // Backward compatible with host compressor
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_workspace_size() (" << SharedStorageSize << ")");
+    return SharedStorageSize;
+  }
+  static Status
+  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter *cuda_adapter = nullptr) {
+    CUTLASS_UNUSED(args);
+    CUTLASS_UNUSED(workspace);
+    CUTLASS_UNUSED(stream);
+    CUTLASS_UNUSED(cuda_adapter);
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::initialize_workspace()");
+    return Status::kSuccess;
+  }
+  static dim3
+  get_grid_shape(Params const& params) {
+    constexpr int MaxAlignmentM = cutlass::const_max(TensorEAlignmentM, TensorAAlignmentM);
+    constexpr int MaxAlignmentK = cutlass::const_max(TensorEAlignmentK, TensorAAlignmentK);
+    const auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
+    const int GemmMAlignedMax = cutlass::round_up(GemmM, MaxAlignmentM);
+    const int GemmKAlignedMax = cutlass::round_up(GemmK, MaxAlignmentK);
+    const int gridDim_X = cutlass::ceil_div(GemmMAlignedMax, TensorEAtomM{});
+    const int gridDim_Y = cutlass::ceil_div(GemmKAlignedMax, TensorEAtomK{});
+    const int gridDim_Z = GemmL;
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_grid_shape() ("
+      << gridDim_X << ", "
+      << gridDim_Y << ", "
+      << gridDim_Z << ")");
+    return dim3(gridDim_X, gridDim_Y, gridDim_Z);
+  }
+  static dim3
+  get_block_shape() {
+    CUTLASS_TRACE_HOST("SM90StructuredSparseCompressor::get_block_shape() ("
+      << MaxThreadsPerBlock << ", "
+      << 1 << ", "
+      << 1 << ")");
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+  CUTE_DEVICE
+  void
+  operator()(Params params, void* smem_buf = nullptr) {
+    run(params, smem_buf);
+  }
+  CUTE_DEVICE
+  static void
+  run(Params params, void* smem_buf = nullptr) {
+    structure_sparse_compress(params, smem_buf);
+  }
+private:
+  struct MetadataOneChunk1to2 {
+    CUTE_DEVICE
+    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
+      auto metadata_bits = [&]() -> uint8_t {
+        CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 2);
+        switch (elt_log_idx) {
+          case 0:
+            return 0b0100;
+          case 1:
+            return 0b1110;
+          default:
+            CUTE_GCC_UNREACHABLE;
+        }
+      };
+      storage_ |= (metadata_bits() << (4 * elt_phy_idx));
+    }
+    CUTE_DEVICE
+    ElementEChunk storage() const {
+      return ElementEChunk{storage_};
+    }
+  private:
+    uint8_t storage_ = 0b0000;
+  };
+  struct MetadataOneChunk2to4{
+    CUTE_DEVICE
+    void set_metadata_bits(int elt_log_idx, int elt_phy_idx) {
+      auto metadata_bits = [&]() -> uint8_t {
+        CUTLASS_ASSERT(elt_log_idx >= 0 && elt_log_idx < 4);
+        switch (elt_log_idx) {
+          case 0:
+            return 0b00;
+          case 1:
+            return 0b01;
+          case 2:
+            return 0b10;
+          case 3:
+            return 0b11;
+          default:
+            CUTLASS_ASSERT(false);
+            CUTE_GCC_UNREACHABLE;
+            return 0b00;
+        }
+      };
+      storage_ |= (metadata_bits() << (2 * elt_phy_idx));
+    }
+    CUTE_DEVICE
+    ElementEChunk storage() const {
+      return ElementEChunk{storage_};
+    }
+  private:
+    uint8_t storage_ = 0b0000;
+  };
+  using MetadataOneChunk = cute::conditional_t<SparseConfig::IsTF32,
+                                               MetadataOneChunk1to2,
+                                               MetadataOneChunk2to4>;
+private:
+  CUTE_DEVICE
+  static void
+  structure_sparse_compress(Params params, void* smem_buf) {
+    // * Input Params
+    auto [GemmM, GemmN, GemmK, GemmL] = params.problem_shape;
+    auto [ptr_A, dA, ptr_ACompress, ptr_E] = params.transform;
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+    [[maybe_unused]] const int gridDim_X = gridDim.x;
+    [[maybe_unused]] const int gridDim_Y = gridDim.y;
+    [[maybe_unused]] const int gridDim_Z = gridDim.z;
+    [[maybe_unused]] const int blockDim_X = blockDim.x;
+    // * Global Tensor Layout
+    const cute::Layout layout_gA = make_layout(make_shape(GemmM, GemmK, GemmL), dA);
+    const cute::Layout layout_gAC = SparseConfig::fill_layoutA(params.problem_shape);
+    const cute::Layout layout_gE = SparseConfig::fill_layoutE(params.problem_shape);
+    // * Construct Global Tensor
+    const cute::Tensor gA   = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUint>(ptr_A)), layout_gA);
+    cute::Tensor gAC_sparse = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementAUintCompressed>(ptr_ACompress)), layout_gAC );
+    cute::Tensor gAC        = cute::recast<ElementAUint>(gAC_sparse);
+    cute::Tensor gE_sparse  = make_tensor(make_gmem_ptr(cute::recast_ptr<ElementEMma>(ptr_E)), layout_gE);
+    cute::Tensor gE         = cute::recast<ElementEMmaRaw>(gE_sparse);
+    // * CTA Tensor Layout
+    using cAsA_layout_row = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutRight{}));
+    using cAsA_layout_col = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{}), LayoutLeft{}));
+    using cAsA_layout     = cute::conditional_t<cute::is_same_v<LayoutATag, layout::RowMajor>, cAsA_layout_row, cAsA_layout_col>;
+    using cACsAC_layout   = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementASparsity{}), LayoutRight{}));
+    using cEsE_layout     = decltype(make_layout(make_shape(TensorEAtomM{}, TensorEAtomK{} / ElementEMmaSparsity{}), LayoutRight{}));
+    CUTE_STATIC_ASSERT(cute::is_static_v<TensorEAtom>, "TensorEAtom needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cAsA_layout>, "cAsA_layout needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cACsAC_layout>, "cACsAC_layout needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<cEsE_layout>, "cEsE_layout needs to be static");
+    const int blockIdx_X = blockIdx.x;
+    const int blockIdx_Y = blockIdx.y;
+    const int blockIdx_Z = blockIdx.z;
+    const int threadIdx_X = threadIdx.x;
+    // * Construct CTA Tensor
+    const auto cta_coord = make_coord(blockIdx_X, blockIdx_Y, blockIdx_Z);
+    cute::Tensor cAgA   = cute::recast<ElementAMmaRawUnit>(local_tile(gA, shape(cAsA_layout{}), cta_coord));
+    cute::Tensor cACgAC = cute::recast<ElementAMmaRawUnit>(local_tile(gAC, shape(cACsAC_layout{}), cta_coord));
+    cute::Tensor cEgE   = local_tile(gE, shape(cEsE_layout{}), cta_coord);
+    cute::Tensor cAsA   = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cAsA)), cAsA_layout{}));
+    cute::Tensor cACsAC = cute::recast<ElementAMmaRawUnit>(make_tensor(make_smem_ptr(cute::recast_ptr<ElementAUint>(shared_storage.cACsAC)), cACsAC_layout{}));
+    cute::Tensor cEsE   = make_tensor(make_smem_ptr(cute::recast_ptr<ElementEMmaRaw>(shared_storage.cEsE)), cEsE_layout{});
+    cute::Tensor cEsE_chunk = cute::recast<ElementEChunk>(cEsE);
+    // * Handle in unit of Chunk when compress
+    using OneChunkSizeA  = Int<LogicalElemsAMmaRawPerChunk>;
+    using OneChunkSizeAC = Int<PhysicalElemsAMmaRawPerChunk>;
+    using OneChunkSizeE  = Int<LogicalElemsAPerChunk / ElementESparsityPerChunk{}>;
+    using NumOneChunkK   = Int<cutlass::ceil_div(TensorEAtomK{}, LogicalElemsAPerChunk)>;
+    cute::Tensor cAsA_log_chunk   = logical_divide(cAsA, make_shape(_, OneChunkSizeA{}));
+    cute::Tensor cACsAC_log_chunk = logical_divide(cACsAC, make_shape(_, OneChunkSizeAC{}));
+    cute::Tensor cEsE_log_chunk   = logical_divide(cEsE_chunk, make_shape(_, OneChunkSizeE{}));
+    // * Corner Case Handle
+    const auto GemmM_within_Cta = (GemmM - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmM - blockIdx_X * TensorEAtomM{};
+    const auto GemmK_within_Cta = ( (GemmK - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmK - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
+    const auto GemmK_NumOneChunk_within_Cta = GemmK_within_Cta / LogicalElemsAMmaRawPerChunk;
+    const auto GemmMAlignedAC = cutlass::round_up(GemmM, TensorAAlignmentM);
+    const auto GemmKAlignedAC = cutlass::round_up(GemmK, TensorAAlignmentK);
+    const auto GemmMAlignedAC_within_Cta = (GemmMAlignedAC - blockIdx_X * TensorEAtomM{} > TensorEAtomM{}) ? TensorEAtomM{} : GemmMAlignedAC - blockIdx_X * TensorEAtomM{};
+    const auto GemmKAlignedAC_within_Cta = ( (GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} > TensorEAtomK{}) ? TensorEAtomK{} : GemmKAlignedAC - blockIdx_Y * TensorEAtomK{} ) / ElemsARawPerElementAMmaRaw;
+    // * Clear CTA Smem Tensor
+    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cACsAC);
+    cooperative_clear<MaxThreadsPerBlock>(threadIdx_X, cEsE);
+    // * Input CTA Tensor G to S
+    if (GemmM_within_Cta == TensorEAtomM{} && GemmK_within_Cta == TensorEAtomK{}) {
+      copy_vec_pred<false, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
+    }
+    else {
+      copy_vec_pred<true, LayoutATag>(cAgA, cAsA, threadIdx_X, GemmM_within_Cta, GemmK_within_Cta);
+    }
+    // Construct a sign bit mask for handling negative zeros
+    ElementAMmaRawUnit sign_mask = ElementAMmaRawUnit{ 0 };
+    if constexpr (has_negative_zero_v<ElementA>) {
+      ElementAMmaRawUnit one_sign_mask = static_cast<ElementAMmaRawUnit>(~(ElementAMmaRawUnit{ 1 } << (cute::sizeof_bits_v<ElementA> - 1)));
+      for (int i = 0; i < sizeof(ElementAMmaRawUnit) / sizeof(ElementAUint); ++i) {
+        sign_mask = static_cast<ElementAMmaRawUnit>((int32_t)sign_mask | (int32_t)one_sign_mask << (i * cute::sizeof_bits_v<ElementA>));
+      }
+    }
+    // * Compress
+    // cACsAC is always row major order
+    // TensorEAtomM threads perform the compression, each thread compress one row
+    const int row_i = threadIdx_X;
+    if (row_i < GemmM_within_Cta) {
+      CUTE_UNROLL
+      for (int col_chunk_i = 0; col_chunk_i < NumOneChunkK{}; ++col_chunk_i) {
+        if (col_chunk_i < GemmK_NumOneChunk_within_Cta) {
+          // Compress is handled in unit of ElementAMmaRawUnit
+          cute::Tensor tAsA   = cAsA_log_chunk(row_i, make_coord(_, col_chunk_i));
+          cute::Tensor tACsAC = cACsAC_log_chunk(row_i, make_coord(_, col_chunk_i));
+          cute::Tensor tEsE   = cEsE_log_chunk(row_i, make_coord(_, col_chunk_i));
+          int non_zero_cnt = 0;
+          // None zero element indx
+          // e.g.
+          //  2:4 sparsity [x 0 0 x]
+          //  non_zero_elt_log_idx = [0, 3]
+          int non_zero_elt_log_idx[OneChunkSizeAC{}] = { 0 };
+          // * Find None Zero Element Idx within Chunk
+          CUTE_UNROLL
+          for (int elt_log_idx = 0; elt_log_idx < OneChunkSizeA{}; ++elt_log_idx) {
+            ElementAMmaRawUnit elem_A = tAsA[elt_log_idx];
+            // Handle negative 0
+            ElementAMmaRawUnit masked_elem_A = elem_A;
+            if constexpr (has_negative_zero_v<ElementA>) {
+              masked_elem_A = elem_A & sign_mask;
+            }
+            if (masked_elem_A != ElementAMmaRawUnit{0}) {
+              non_zero_elt_log_idx[non_zero_cnt] = elt_log_idx;
+              tACsAC[non_zero_cnt] = elem_A;
+              non_zero_cnt++;
+            }
+          }
+          // * Corner Case for 2:4 sparsity
+          if constexpr (cute::sizeof_bits_v<ElementAMmaRawUnit> < 32) {
+            // i.e. [0 0 0 x] -> [(0) 0 0 x]
+            if (non_zero_cnt == 1 && non_zero_elt_log_idx[0] == 3) {
+              tACsAC[1] = tACsAC[0];
+              tACsAC[0] = ElementAMmaRawUnit{0};
+              non_zero_elt_log_idx[0] = 0;
+              non_zero_elt_log_idx[1] = 3;
+            }
+            // i.e. [0 0 x 0] -> [0 0 x (0)]
+            // i.e. [0 x 0 0] -> [0 x 0 (0)]
+            // i.e. [x 0 0 0] -> [x 0 0 (0)]
+            else if (non_zero_cnt == 1) {
+              tACsAC[1] = ElementAMmaRawUnit{0};
+              non_zero_elt_log_idx[1] = 3;
+            }
+          }
+          // * Set Metadata Bits
+          MetadataOneChunk metadata_one_chunk;
+          CUTE_UNROLL
+          for (int elt_phy_idx = 0; elt_phy_idx < OneChunkSizeAC{}; elt_phy_idx++) {
+            metadata_one_chunk.set_metadata_bits(non_zero_elt_log_idx[elt_phy_idx], elt_phy_idx);
+          }
+          tEsE[0] = metadata_one_chunk.storage();
+        }
+        else {
+          break;
+        }
+      }
+    }
+    // * Sync after Compress
+    __syncthreads();
+    // * Output Cta Tensor S to G
+    if (GemmM_within_Cta > 0 && GemmK_within_Cta > 0) {
+      constexpr int MaxVecBits = 128; // STG.128
+      cute::cooperative_copy<MaxThreadsPerBlock, MaxVecBits>(threadIdx_X, cEsE, cEgE);
+    }
+    if (GemmMAlignedAC_within_Cta == TensorEAtomM{} && GemmKAlignedAC_within_Cta == TensorEAtomK{}) {
+      copy_vec_pred<false, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
+    }
+    else {
+      copy_vec_pred<true, LayoutATag>(cACsAC, cACgAC, threadIdx_X, GemmMAlignedAC_within_Cta, (GemmKAlignedAC_within_Cta / ElementASparsity::value));
+    }
+  } // end of structure_sparse_compress()
+  template<uint32_t NumThreads,
+           typename TensorSrc>
+  CUTE_DEVICE
+  static void
+  cooperative_clear(
+    uint32_t const& tid,
+    TensorSrc dSrc) {
+    auto dSrctSrc = local_partition(dSrc, make_layout(make_shape(NumThreads, _1{})), tid);
+    cute::clear(dSrctSrc);
+    // Sync all thread data access
+    __syncthreads();
+  }
+  template <bool pred,
+            typename LayoutTag,
+            typename TensorSrc,
+            typename TensorDst>
+  CUTE_DEVICE
+  static void
+  copy_vec_pred(
+      TensorSrc dSrc,
+      TensorDst dDst,
+      int threadIdx_X,
+      int valid_rows,
+      int valid_cols) {
+    constexpr bool IsRowMajor = cute::is_same_v<LayoutTag, cutlass::layout::RowMajor>;
+    using Element = typename TensorSrc::element_type;
+    constexpr bool IsQmmaF6 = cute::sizeof_bits_v<Element> == 6;
+    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dSrc))>, "shape(dSrc) needs to be static");
+    CUTE_STATIC_ASSERT(cute::is_static_v<decltype(shape(dDst))>, "shape(dDst) needs to be static");
+    CUTE_STATIC_ASSERT(cute::sizeof_bits_v<typename TensorSrc::element_type> == cute::sizeof_bits_v<typename TensorDst::element_type>,
+      "dSrc and dDst need to have same element bit width");
+    CUTE_STATIC_ASSERT(cute::size(dSrc) == cute::size(dDst), "dSrc and dDst need to have same size");
+    // ValueShape
+    using ValueShape =
+      cute::conditional_t<IsQmmaF6,
+                          Shape<Int<1>, Int<1>>,
+      cute::conditional_t<IsRowMajor,
+                          Shape<Int<1>, Int<128 / sizeof_bits_v<Element>>>,
+                          Shape<Int<128 / sizeof_bits_v<Element>>, Int<1>>>
+      >;
+    constexpr int ValueShapeRows = shape<0>(ValueShape{});
+    constexpr int ValueShapeCols = shape<1>(ValueShape{});
+    // ThreadShape
+    using ThreadShape =
+      cute::conditional_t<IsQmmaF6,
+                          cute::conditional_t<IsRowMajor,
+                                              Shape<Int<MaxThreadsPerBlock>, Int<1>>,
+                                              Shape<Int<1>, Int<MaxThreadsPerBlock>>>,
+      cute::conditional_t<IsRowMajor,
+                          Shape<Int<MaxThreadsPerBlock / (shape<1>(dSrc) / ValueShapeCols)>, Int<                     (shape<1>(dSrc) / ValueShapeCols)>>,
+                          Shape<Int<                     (shape<0>(dSrc) / ValueShapeRows)>, Int<MaxThreadsPerBlock / (shape<0>(dSrc) / ValueShapeRows)>>>
+      >;
+    constexpr int ThreadShapeRows = shape<0>(ThreadShape{});
+    constexpr int ThreadShapeCols = shape<1>(ThreadShape{});
+    const int threadIdx_X_row = threadIdx_X / ThreadShapeCols;
+    const int threadIdx_X_col = threadIdx_X % ThreadShapeCols;
+    // Row Major
+    if constexpr (IsRowMajor) {
+      CUTE_UNROLL
+      for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
+        CUTE_UNROLL
+        for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
+          CUTE_UNROLL
+          for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
+            CUTE_UNROLL
+            for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
+              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
+              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
+              if constexpr ( (not pred) and (not IsQmmaF6) ) {
+                dDst(row_i, col_i) = dSrc(row_i, col_i);
+              }
+              else {
+                if (row_i < valid_rows && col_i < valid_cols) {
+                  dDst(row_i, col_i) = dSrc(row_i, col_i);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Col Major
+    else {
+      CUTE_UNROLL
+      for (int col_chunk_i = 0; col_chunk_i < cutlass::ceil_div(shape<1>(dSrc) , ThreadShapeCols * ValueShapeCols); ++col_chunk_i) {
+        CUTE_UNROLL
+        for (int iter_row_blk = 0; iter_row_blk < cutlass::ceil_div(shape<0>(dSrc), ThreadShapeRows * ValueShapeRows); ++iter_row_blk) {
+          CUTE_UNROLL
+          for (int iter_col_thr = 0; iter_col_thr < ValueShapeCols; ++iter_col_thr) {
+            CUTE_UNROLL
+            for (int iter_row_thr = 0; iter_row_thr < ValueShapeRows; ++iter_row_thr) {
+              const int row_i = (iter_row_blk * ThreadShapeRows + threadIdx_X_row) * ValueShapeRows + iter_row_thr;
+              const int col_i = (col_chunk_i * ThreadShapeCols + threadIdx_X_col) * ValueShapeCols + iter_col_thr;
+              if constexpr ( (not pred) and (not IsQmmaF6) ) {
+                dDst(row_i, col_i) = dSrc(row_i, col_i);
+              }
+              else {
+                if (row_i < valid_rows && col_i < valid_cols) {
+                  dDst(row_i, col_i) = dSrc(row_i, col_i);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // Sync all thread data access
+    __syncthreads();
+  } // end of copy_vec_pred()
+};
+}  // namespace cutlass::transform::kernel

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp ADDED Viewed

	@@ -0,0 +1,325 @@

+/***************************************************************************************************
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Compress utils for structured sparse kernels
+*/
+#pragma once
+#include <algorithm>                           // std::fill
+#include <array>                               // std::array
+#include <random>                              // std::mt19937
+#include "cute/numeric/numeric_types.hpp"      // cute::sizeof_bits_v
+#include "cute/tensor.hpp"                     // cute::Tensor, cute::make_tensor
+#include "cutlass/arch/arch.h"                 // cutlass::arch::SmXY
+#include "cutlass/detail/dependent_false.hpp"  // cutlass::detail::dependent_false
+#include "cutlass/gemm/gemm.h"                 // cutlass::TagToStrideA_t
+#include "cutlass/fast_math.h"                 // cutlass::ceil_div, cutlass::round_up
+#include "cutlass/numeric_size.h"              // cutlass::bits_to_bytes
+#include "cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp"
+namespace cutlass::transform::kernel {
+template<
+  class ProblemShape_,
+  class ElementA_,
+  class LayoutATag_,
+  class SparseConfig_
+>
+class StructuredSparseCompressorUtility {
+public:
+  using SparseConfig = SparseConfig_;
+  using ProblemShape = ProblemShape_;
+  //* EltA
+  using ElementA = ElementA_;
+  using LayoutATag = LayoutATag_;
+  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutATag>;
+  using ElementAMmaRaw = typename SparseConfig::ElementAMmaRaw;
+  using ElementASparsity = typename SparseConfig::ElementASparsity;
+  using ElementAMmaSparsity = typename SparseConfig::ElementAMmaSparsity;
+  //* EltE
+  using ElementEMmaRaw = typename SparseConfig::ElementEMmaRaw;
+  using ElementEMmaSparsity = typename SparseConfig::ElementEMmaSparsity;
+  //* AtomE
+  using TensorEAtom = typename SparseConfig::TensorEAtom;
+  using TensorEAtomK = typename SparseConfig::TensorEAtomK;
+  using TensorEAtomM = typename SparseConfig::TensorEAtomM;
+  static constexpr int ElemsARawPerElementAMmaRaw = typename SparseConfig::ElemsARawPerElementAMmaRaw{};
+  static constexpr int LogicalElemsAPerChunk = typename SparseConfig::LogicalElemsAPerChunk{};
+  static constexpr int PhysicalElemsAPerChunk = typename SparseConfig::PhysicalElemsAPerChunk{};
+  static constexpr int LogicalElemsAMmaRawPerChunk = cutlass::ceil_div(LogicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  static constexpr int PhysicalElemsAMmaRawPerChunk = cutlass::ceil_div(PhysicalElemsAPerChunk, ElemsARawPerElementAMmaRaw);
+  //* Alignment
+  static constexpr int TensorEAlignmentM = typename SparseConfig::TensorEAlignmentM{};
+  static constexpr int TensorEAlignmentK = typename SparseConfig::TensorEAlignmentK{};
+  static constexpr int TensorAAlignmentK = typename SparseConfig::TensorAAlignmentK{};
+  static constexpr int TensorAAlignmentM = typename SparseConfig::TensorAAlignmentM{};
+  StructuredSparseCompressorUtility() = default;
+  StructuredSparseCompressorUtility(ProblemShape problem, StrideA dA) {
+    set_problem_size(problem, dA);
+  }
+  void set_problem_size(ProblemShape problem, StrideA dA_) {
+    M = cute::size<0>(problem);
+    K = cute::size<2>(problem);
+    L = cute::size<3>(problem);
+    // The following three vars are logical elem count!
+    K_alignedA  = round_up(K, TensorAAlignmentK);
+    M_alignedA  = round_up(M, TensorAAlignmentM);
+    K_alignedE = round_up(K, TensorEAlignmentK);
+    M_alignedE = round_up(M, TensorEAlignmentM);
+    dA = dA_;
+  }
+  /**
+   * @brief Get the TensorE number of ElementE along K after alignment requirement
+   *
+   * @return int : number of ElementE (uint8_t) along K-dim
+   */
+  int get_metadata_m_physical() const {
+    return M_alignedE;
+  }
+  /**
+   * @brief Get the TensorE number of ElementE along M after alignment requirement
+   *
+   * @return int : number of ElementE (uint8_t) along M-dim
+   */
+  int get_metadata_k_physical() const {
+    return K_alignedE / ElementEMmaSparsity{};
+  }
+  /**
+   * @brief Get the TensorACompressed number of ElementA along K after alignment requirement
+   *
+   * @return int : number of ElementA along K-dim
+   */
+  int get_tensorA_k_physical() const {
+    return K_alignedA / ElementASparsity{};
+  }
+  /**
+   * @brief Get the TensorACompressed number of ElementA along M after alignment requirement
+   *
+   * @return int : number of ElementA along M-dim
+   */
+  int get_tensorA_m_physical() const {
+    return M_alignedA;
+  }
+  /**
+   * @brief Get the TensorACompressed Bytes
+   *
+   * @return uint64_t bytes
+   */
+  uint64_t get_compressed_tensor_A_bytes() const {
+    const auto tensor_a_comp_num_elt_a = get_tensorA_m_physical() * get_tensorA_k_physical() * L;
+    const auto tensor_a_comp_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_comp_num_elt_a * cute::sizeof_bits_v<ElementA>);
+    return tensor_a_comp_bytes;
+  }
+  /**
+   * @brief Get the TensorA Bytes
+   *
+   * @return uint64_t bytes
+   */
+  uint64_t get_raw_tensor_A_bytes() const {
+    const auto tensor_a_num_elt_a = uint64_t(M) * uint64_t(K) * uint64_t(L);
+    const auto tensor_a_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_a_num_elt_a * cute::sizeof_bits_v<ElementA>);
+    return tensor_a_bytes;
+  }
+  /**
+   * @brief Get the TensorE Bytes
+   *
+   * @return uint64_t bytes
+   */
+  uint64_t get_tensor_E_bytes() const {
+    const auto tensor_e_num_elt_a = uint64_t(get_metadata_m_physical()) * uint64_t(get_metadata_k_physical()) * uint64_t(L);
+    const auto tensor_e_bytes = cutlass::bits_to_bytes<uint64_t>(tensor_e_num_elt_a * cute::sizeof_bits_v<ElementEMmaRaw>);
+    return tensor_e_bytes;
+  }
+  constexpr auto fill_layoutA_from_compressor() const {
+    return SparseConfig::fill_layoutA(cute::make_tuple(M,_1{},K,L));
+  }
+  constexpr auto fill_layoutE_from_compressor() const {
+    return SparseConfig::fill_layoutE(cute::make_tuple(M,_1{},K,L));
+  }
+  void structure_sparse_zero_mask_fill(void* host_a_ptr, uint64_t seed) {
+    constexpr int ChunkSize = LogicalElemsAMmaRawPerChunk;
+    using ChunkElement = cute::uint_bit_t<cute::sizeof_bits_v<ElementAMmaRaw>>;
+    cute::Tensor gA_eltA = cute::make_tensor(
+        cute::recast_ptr<ElementA>(host_a_ptr),
+        cute::make_layout(make_shape(M, K, L), dA));
+    // Input TensorA is handled in unit of ElementAMmaRaw instead of ElementA
+    cute::Tensor gA = cute::recast<ChunkElement>(gA_eltA);
+    // Extract out the Chunk from K-mode
+    Tensor gA_chunk = cute::zipped_divide(gA, cute::Shape<_1,cute::Int<ChunkSize>>{}); // (Chunk, Rest)
+    // Half of the data is zero to indicate sparsityA = 2
+    std::array<int, ChunkSize> nnzb_indicator{};
+    for (size_t i = 1; i < nnzb_indicator.size(); i += 2) {
+      nnzb_indicator.at(i) = 1;
+    }
+    std::mt19937 rng(seed);
+    auto rest_shape = cute::shape<1>(gA_chunk);
+    for (auto iter = cute::make_coord_iterator(rest_shape); iter != cute::ForwardCoordIteratorSentinel{}; ++iter) {
+      std::shuffle(nnzb_indicator.begin(), nnzb_indicator.end(), rng);
+      for (int c = 0; c < size<0>(gA_chunk); ++c) {                        // for each elem within chunk
+        if (nnzb_indicator[c] == 0) {
+          gA_chunk(c, *iter) = ChunkElement{0};
+        }
+      }  // end of within chunk
+    }    // end of chunk_idx
+  }
+  int M{-1};
+  int K{-1};
+  int L{-1};
+  StrideA dA{};
+private:
+  int K_alignedA{-1};
+  int M_alignedA{-1};
+  int K_alignedE{-1};
+  int M_alignedE{-1};
+};
+////////////////////////////////////////////////////////////////////////////////
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig,
+  class ArchTag
+>
+struct StructuredSparseCompressorSelector {
+  static_assert(cutlass::detail::dependent_false<ArchTag>,
+      "Could not select a structured sparse compressor for given parameters.");
+};
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig
+>
+struct StructuredSparseCompressorSelector<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    arch::Sm90> {
+  using Compressor = SM90StructuredSparseCompressor<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig
+  >;
+};
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig
+>
+struct StructuredSparseCompressorSelector<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    arch::Sm100> {
+  using Compressor = SM90StructuredSparseCompressor<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig
+  >;
+};
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig
+>
+struct StructuredSparseCompressorSelector<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    arch::Sm120> {
+  using Compressor = SM90StructuredSparseCompressor<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig
+  >;
+};
+template<
+  class ProblemShape,
+  class ElementA,
+  class LayoutATag,
+  class SparseConfig,
+  class ArchTag
+>
+using StructuredSparseCompressor = typename StructuredSparseCompressorSelector<
+    ProblemShape,
+    ElementA,
+    LayoutATag,
+    SparseConfig,
+    ArchTag
+>::Compressor;
+} // End namespace cutlass::transform::kernel

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/pitch_linear_thread_map.h ADDED Viewed

	@@ -0,0 +1,926 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing how threads are mapped to a given tile.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+////////////////////////////////////////////////////////////////////////////////
+/// Strip-mines a pitch-linear tile among a given number of threads, first along
+/// the contiguous dimension then along the strided dimension.
+///
+/// The tile must be divisible by the thread count such that all threads may
+/// execute the same number of iterations with the same delta to exhaustively
+/// cover the tile.
+///
+/// This class satisfies the "RegularThreadMapping" concept.
+///
+/// This ThreadMap is used by SIMT kernels and operand E of the sparse tensor
+/// kernels.
+template <
+  typename Shape_,
+  int Threads,
+  int ElementsPerAccess = 1
+>
+struct PitchLinearStripminedThreadMap {
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+  /// Tile shape
+  using Shape = Shape_;
+  /// Number of threads total
+  static int const kThreads = Threads;
+  /// Extract vector length from Layout
+  static int const kElementsPerAccess = ElementsPerAccess;
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+  /// Internal implementation details
+  struct Detail {
+    static_assert(!(Shape::kContiguous % kElementsPerAccess), "");
+    /// Shape of the tile in units of vectors
+    using ShapeVec = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+    static_assert((Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
+                      (!(kThreads % ShapeVec::kContiguous)),
+                  "Shape must be divisible by number of iterations of each thread.");
+  };
+  /// Number of iterations by each thread
+  using Iterations = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<
+          1,
+          // Redo the comparison here to work around divide by zero compiler
+          // error.  The compiler evaluates both path of platform::conditional.
+          (Threads >= Detail::ShapeVec::kContiguous
+               ? (Detail::ShapeVec::kStrided + (kThreads / Detail::ShapeVec::kContiguous - 1)) /
+                     (kThreads / Detail::ShapeVec::kContiguous)
+               : 0)>,
+      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
+                               Detail::ShapeVec::kStrided>>::type;
+  /// Interval between accesses along each dimension of the tensor's logical coordinate space
+  /// (in units of Elements)
+  using Delta = typename platform::conditional<
+    Threads >= Detail::ShapeVec::kContiguous,
+    layout::PitchLinearShape<
+      1,
+      kThreads / Detail::ShapeVec::kContiguous
+    >,
+    layout::PitchLinearShape<
+      kThreads * kElementsPerAccess,
+      1
+    >
+  >::type;
+  /// Shape of the tile in units of vectors
+  using StorageShape = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<Shape::kContiguous,
+                               Iterations::kStrided*(kThreads / Detail::ShapeVec::kContiguous)>,
+      layout::PitchLinearShape<Shape::kContiguous, Shape::kStrided>>::type;
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  /// (in units of Elements)
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    return TensorCoord(
+      (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess,
+      thread_id / Detail::ShapeVec::kContiguous);
+  }
+};
+/// This ThreadMap is used by GEMV
+template <
+  typename Shape,
+  int Threads,
+  int ElementsPerAccess = 1
+>
+struct PitchLinearTilePolicyStripminedThreadContiguous
+{
+ static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
+              "Contiguous shape must divide number of threads");
+  using TensorCoord = layout::PitchLinearCoord;
+  static int const kThreads = Threads;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Iterations = layout::PitchLinearShape<
+                      Shape::kContiguous / (kThreads * kElementsPerAccess),
+                      Shape::kStrided>;
+  using Delta = layout::PitchLinearShape<1, 1>;
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id)
+  {
+    return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
+  }
+};
+template <
+  typename Shape,
+  int Threads,
+  int ElementsPerAccess = 1
+>
+struct PitchLinearTilePolicyStripminedThreadStrided
+{
+  static_assert((Shape::kStrided % Threads == 0),
+                "Strided shape must divide number of threads");
+  using TensorCoord = layout::PitchLinearCoord;
+  static int const kThreads = Threads;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Iterations = layout::PitchLinearShape<
+                      Shape::kContiguous / kElementsPerAccess,
+                      Shape::kStrided / kThreads>;
+  using Delta = layout::PitchLinearShape<1, 1>;
+  using ShapeVec = Shape;
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id)
+  {
+    return TensorCoord(0, thread_id * Iterations::kStrided);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
+/// elements.
+///
+/// This ThreadMap is used by tensor core kernels.
+template <
+  typename Shape_,
+  int Threads,
+  typename WarpThreadArrangement_,
+  int ElementsPerAccess = 1
+>
+struct PitchLinearWarpRakedThreadMap {
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+  /// Tile shape
+  using Shape = Shape_;
+  /// Number of threads total
+  static int const kThreads = Threads;
+  /// Extract vector length from Layout
+  static int const kElementsPerAccess = ElementsPerAccess;
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+    static_assert(
+      !(Shape::kContiguous % kElementsPerAccess),
+      "Shape must be divisible by vector length.");
+    /// Compute the 'shape' of the overall tile in units of vectors
+    using ShapeInAccesses = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+    static_assert(
+      !(ShapeInAccesses::kContiguous % WarpThreadArrangement::kContiguous),
+      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
+    static_assert(
+      !(ShapeInAccesses::kStrided % WarpThreadArrangement::kStrided),
+      "ShapeInAccesses must be divisible by WarpThreadArrangement.");
+    // compute number of warp-level accesses total
+    using WarpAccessIterations = layout::PitchLinearShape<
+      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
+      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
+    >;
+    // Divide it into the number of warps, first partitioning the strided dimension then the
+    // contiguous.
+    static int const kWarpsStrided =
+        (WarpAccessIterations::kStrided >= kWarpCount
+             ? kWarpCount
+             : WarpAccessIterations::kStrided);
+    static int const kWarpsContiguous =
+        (kWarpCount > WarpAccessIterations::kStrided
+             ? kWarpCount / kWarpsStrided
+             : 1);
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+  };
+  ///< Iterations along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta = layout::PitchLinearShape<
+    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
+    Detail::WarpThreadArrangement::kStrided
+  >;
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+    //
+    // compute warp-level offset
+    //
+    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
+    };
+    // This is the offset of a specific warp (in units of vectors)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+    // This is the offset of a thread within a threadblock tile (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous
+/// elements. Warps are arranged based on a stride.
+///
+/// This ThreadMap is used by tensor core kernels for NCxHWx layout.
+template <
+  typename Shape_,
+  int Threads,
+  typename WarpThreadArrangement_,
+  int ElementsPerAccess = 1
+>
+struct PitchLinearStridedWarpRakedThreadMap {
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+  /// Tile shape
+  using Shape = Shape_;
+  /// Number of threads total
+  static int const kThreads = Threads;
+  using WarpThreadArrangement = WarpThreadArrangement_;
+  /// Extract vector length from Layout
+  static int const kElementsPerAccess = ElementsPerAccess;
+  /// Base ThreadMap
+  using BaseThreadMap = PitchLinearWarpRakedThreadMap<
+    Shape,
+    kThreads,
+    WarpThreadArrangement,
+    kElementsPerAccess
+  >;
+  /// Shape of access by each thread
+  using ThreadAccessShape = typename BaseThreadMap::ThreadAccessShape;
+  struct Detail {
+    using WarpThreadArrangement = WarpThreadArrangement_;
+    using WarpAccessIterations = typename BaseThreadMap::Detail::WarpAccessIterations;
+    static int const kWarpSize = BaseThreadMap::Detail::kWarpSize;
+    static int const kWarpCount = BaseThreadMap::Detail::kWarpCount;
+    using ShapeInAccesses = typename BaseThreadMap::Detail::ShapeInAccesses;
+    // Divide it into the number of warps, first partitioning the contiguous dimension then the
+    // stride.
+    static int const kWarpsContiguous =
+        (WarpAccessIterations::kContiguous >= kWarpCount
+             ? kWarpCount
+             : WarpAccessIterations::kContiguous);
+    static int const kWarpsStrided =
+        (kWarpCount > WarpAccessIterations::kContiguous
+             ? kWarpCount / kWarpsContiguous
+             : 1);
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+  };
+  ///< Iterations along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta = typename BaseThreadMap::Delta;
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+    //
+    // compute warp-level offset
+    //
+    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
+    };
+    // This is the offset of a specific warp (in units of vectors)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+    // This is the offset of a thread within a threadblock tile (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Transpose the existing ThreadMap.  For example, interleaved layout is like
+/// congruous in the global memory and crosswise in the shared memory.  We need
+/// to transpose the coordinates between two.
+template <typename ThreadMap_, typename WarpThreadArrangement_>
+struct TransposePitchLinearThreadMap {
+  /// Underlying ThreadMap
+  using ThreadMap = ThreadMap_;
+  /// Tensor coordinate
+  using TensorCoord = typename ThreadMap::TensorCoord;
+  /// Tile shape
+  using Shape = typename ThreadMap::Shape;
+  /// Number of threads total
+  static int const kThreads = ThreadMap::kThreads;
+  /// Extract vector length from Layout
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+    static_assert(!(Shape::kContiguous % kElementsPerAccess),
+                  "Shape must be divisible by vector length.");
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement =
+        layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,
+                                 ThreadMap::Detail::kWarpsContiguous>;
+  };
+  ///< Iterations along each dimension (concept: PitchLinearShape)
+  using Iterations =
+      layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+                               ThreadMap::Iterations::kContiguous>;
+  static_assert(Iterations::kContiguous == 1,
+    "Contiguous iteration has to be one to reuse the same shared store function with those that don't need transpose");
+  static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta =
+      layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *
+                                   kElementsPerAccess,
+                               Detail::WarpThreadArrangement::kStrided>;
+  /// Maps thread ID to a coordinate offset within the tensor's logical
+  /// coordinate space Note this is slightly different from the one of
+  /// PitchLinearWarpRakedThreadMap.
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+    //
+    // compute warp-level offset
+    //
+    // This is the shape of the entire area covered by a warp's memory access
+    // (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+        Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+        Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};
+    // This is the offset of a specific warp (in units of vectors)
+    // Note the order of / and %. Also the 2nd operand is kStrided.
+    layout::PitchLinearCoord warp_offset{
+        (warp_id / Detail::WarpArrangement::kStrided),
+        (warp_id % Detail::WarpArrangement::kStrided)};
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+        lane_id % Detail::WarpThreadArrangement::kContiguous,
+        lane_id / Detail::WarpThreadArrangement::kContiguous};
+    // This is the offset of a thread within a threadblock tile (units of
+    // vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+        warp_footprint * warp_offset + thread_offset_in_warp;
+    // This is the offset of a thread within a threadblock tile (units of
+    // elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+        thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+        thread_offset_in_threadblock_tile_vec.strided()};
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+template <typename ThreadMap_>
+struct TransposePitchLinearThreadMapSimt {
+    /// Underlying ThreadMap
+    using ThreadMap = ThreadMap_;
+    /// Tensor coordinate
+    using TensorCoord = typename ThreadMap::TensorCoord;
+    /// Tile shape
+    using Shape = typename ThreadMap::Shape;
+    /// Number of threads total
+    static int const kThreads = ThreadMap::kThreads;
+    /// Extract vector length from Layout
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+    static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1");
+    ///< Iterations along each dimension (concept: PitchLinearShape)
+    using Iterations =
+        layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+        ThreadMap::Iterations::kContiguous>;
+    static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+    static_assert(Iterations::kStrided == 1,
+      "Strided iteration has to be one to reuse the same shared store function with those that don't need transpose");
+    /// Shape of access by each thread
+    using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
+    ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+    using Delta =
+        layout::PitchLinearShape<ThreadMap::Delta::kStrided,
+        ThreadMap::Delta::kContiguous>;
+    /// Maps thread ID to a coordinate offset within the tensor's logical
+    /// coordinate space Note this is slightly different from the one of
+    /// PitchLinearWarpRakedThreadMap.
+    CUTLASS_HOST_DEVICE
+        static TensorCoord initial_offset(int thread_id) {
+        TensorCoord coord = ThreadMap::initial_offset(thread_id);
+        return TensorCoord(
+            coord.strided(),
+            coord.contiguous()
+        );
+    }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Policy defining a warp-striped arrangement.  This partitions a tile into vectorized memory
+/// accesses performed by each warp then distributes warps across them. Warps are striped in the
+/// strided dimension and raked across the contiguous dimension.
+template <
+  typename Shape_,                          /// Overall shape to partition in units of elements
+  int Threads,                              /// Number of partiticipation threads
+  typename WarpThreadArrangement_,          /// Describes the shape of one memory access per warp
+  int ElementsPerAccess = 1                 /// Number of elements accessed by each thread per memory operation (i.e. vector size)
+>
+struct PitchLinearWarpStripedThreadMap {
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+  /// Tile shape
+  using Shape = Shape_;
+  /// Number of threads total
+  static int const kThreads = Threads;
+  /// Extract vector length from Layout
+  static int const kElementsPerAccess = ElementsPerAccess;
+  /// Shape of access by each thread
+  using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// Fixed arrangement of threads within a warp (units of threads).
+    using WarpThreadArrangement = WarpThreadArrangement_;
+    /// Number of threads per warp
+    static int const kWarpSize = WarpThreadArrangement::kCount;
+    /// Number of participating warps
+    static int const kWarpCount = kThreads / kWarpSize;
+    static_assert(
+      !(Shape::kContiguous % kElementsPerAccess),
+      "Shape must be divisible by vector length.");
+    /// Compute the 'shape' of the overall tile in units of vectors
+    using ShapeInAccesses = layout::PitchLinearShape<
+      Shape::kContiguous / kElementsPerAccess,
+      Shape::kStrided
+    >;
+    // compute number of warp-level accesses total
+    using WarpAccessIterations = layout::PitchLinearShape<
+      ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
+      ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
+    >;
+    // Divide it into the number of warps, first partitioning the strided dimension then the
+    // contiguous.
+    static int const kWarpsStrided =
+      (WarpAccessIterations::kStrided >= kWarpCount
+        ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));
+    static int const kWarpsContiguous =
+      (kWarpCount > WarpAccessIterations::kStrided ?
+        WarpAccessIterations::kContiguous / kWarpsStrided : 1);
+    /// Arrangement of warps within a threadblock-scoped tile
+    using WarpArrangement = layout::PitchLinearShape<
+      kWarpsContiguous, kWarpsStrided
+    >;
+  };
+  ///< Iterations along each dimension (concept: PitchLinearShape)
+  using Iterations = layout::PitchLinearShape<
+    Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
+    Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
+  >;
+  static_assert(Iterations::kCount,
+    "Number of iterations must be non-zero");
+  ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+  using Delta = layout::PitchLinearShape<
+    Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
+    Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
+  >;
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    int warp_id = (thread_id / Detail::kWarpSize);
+    int lane_id = (thread_id % Detail::kWarpSize);
+    //
+    // compute warp-level offset
+    //
+    // This is the shape of the entire area covered by a warp's memory access (in units of vectors)
+    layout::PitchLinearCoord warp_footprint{
+      Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
+      Detail::WarpThreadArrangement::kStrided
+    };
+    // This is the offset of a specific warp (in units of vectors)
+    layout::PitchLinearCoord warp_offset{
+      (warp_id % Detail::kWarpsContiguous),
+      (warp_id / Detail::kWarpsContiguous)
+    };
+    // This is the offset of a specific thread within a warp (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_warp{
+      lane_id % Detail::WarpThreadArrangement::kContiguous,
+      lane_id / Detail::WarpThreadArrangement::kContiguous
+    };
+    // This is the offset of a thread within a threadblock tile (units of vectors)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
+      warp_footprint * warp_offset + thread_offset_in_warp;
+    // This is the offset of a thread within a threadblock tile (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
+      thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
+      thread_offset_in_threadblock_tile_vec.strided()
+    };
+    return thread_offset_in_threadblock_tile_base;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Strip-mines a pitch-linear tile among a given number of threads, first along the contiguous
+/// dimension then along the strided dimension, while each thread access a 2D thread-tile.
+///
+/// The tile must be divisible by the thread count such that all threads may execute the same
+/// number of iterations with the same delta to exhaustively cover the tile.
+///
+/// This class satisfies the "RegularThreadMapping" concept.
+template <
+  typename Shape_,
+  int Threads,
+        typename ThreadTileShape
+>
+struct PitchLinear2DThreadTileStripminedThreadMap;
+template <
+  typename Shape_,
+  int Threads
+>
+struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{
+  /// Tensor coordinate
+  using TensorCoord = layout::PitchLinearCoord;
+  /// Tile shape
+  using Shape = Shape_;
+  /// Access Shape of each thread
+  using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;
+  //using ThreadAccessShape = ThreadTileShape;
+  /// Number of threads total
+  static int const kThreads = Threads;
+  /// Extract length of each access from Layout
+  static int const kElementsPerAccess = ThreadAccessShape::kContiguous;
+  static_assert(!(kElementsPerAccess % 4) , "kElementsPerAccess, needs to be multiple of 4 (32bits)");
+  /// Internal implementation details
+  struct Detail {
+    static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");
+    static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");
+    static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),
+      "Shape must be divisible thread count * accesses per thread.");
+    /// Shape of the tile in units of vectors
+    using ShapeVec = layout::PitchLinearShape<
+      Shape::kContiguous / ThreadAccessShape::kContiguous,
+      Shape::kStrided / ThreadAccessShape::kStrided
+    >;
+    static_assert(
+      (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
+      (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
+      "Shape must be divisible by number of iterations of each thread."
+    );
+  };
+  /// Number of iterations by each thread
+  using Iterations = typename platform::conditional<
+      Threads >= Detail::ShapeVec::kContiguous,
+      layout::PitchLinearShape<
+          1,
+          // Redo the comparison here to work around divide by zero compiler
+          // error.  The compiler evaluates both path of platform::conditional.
+          (Threads >= Detail::ShapeVec::kContiguous
+               ? Detail::ShapeVec::kStrided /
+                     (kThreads / Detail::ShapeVec::kContiguous)
+               : 0)>,
+      layout::PitchLinearShape<Detail::ShapeVec::kContiguous / kThreads,
+                               Detail::ShapeVec::kStrided>>::type;
+  /// Interval between accesses along each dimension of the tensor's logical coordinate space
+  /// (in units of Elements)
+  using Delta = typename platform::conditional<
+    Threads >= Detail::ShapeVec::kContiguous,
+    layout::PitchLinearShape<
+      Shape::kContiguous,
+      kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
+    >,
+    layout::PitchLinearShape<
+      kThreads * ThreadAccessShape::kContiguous,
+      1
+    >
+  >::type;
+  /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space
+  /// (in units of Elements)
+  CUTLASS_HOST_DEVICE
+  static TensorCoord initial_offset(int thread_id) {
+    return TensorCoord(
+      (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,
+      (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);
+  }
+};
+/// Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping
+template <typename ThreadMap_>
+struct TransposePitchLinearThreadMap2DThreadTile {
+    /// Underlying ThreadMap
+    using ThreadMap = ThreadMap_;
+    /// Tensor coordinate
+    using TensorCoord = typename ThreadMap::TensorCoord;
+    /// Tile shape
+    using Shape = typename ThreadMap::Shape;
+    /// Number of threads total
+    static int const kThreads = ThreadMap::kThreads;
+    /// Extract vector length from Layout
+    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+    static_assert(kElementsPerAccess > 1 , "Simt transpose requires elements per access to be 1");
+    ///< Iterations along each dimension (concept: PitchLinearShape)
+    using Iterations =
+        layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
+        ThreadMap::Iterations::kContiguous>;
+    static_assert(Iterations::kCount, "Number of iterations must be non-zero");
+    /// Shape of access by each thread
+    using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
+    ///< Delta between accesses (units of elements, concept: PitchLinearShape)
+    using Delta =
+        layout::PitchLinearShape<ThreadMap::Delta::kStrided,
+        ThreadMap::Delta::kContiguous>;
+    /// Maps thread ID to a coordinate offset within the tensor's logical
+    /// coordinate space Note this is slightly different from the one of
+    /// PitchLinearWarpRakedThreadMap.
+    CUTLASS_HOST_DEVICE
+        static TensorCoord initial_offset(int thread_id) {
+        TensorCoord coord = ThreadMap::initial_offset(thread_id);
+        return TensorCoord(
+            coord.strided(),
+            coord.contiguous()
+        );
+    }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace transform
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/thread/transpose.h ADDED Viewed

	@@ -0,0 +1,107 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Basic copy routines for tensor views
+*/
+#pragma once
+namespace cutlass {
+namespace transform {
+namespace thread {
+/// Transforms a fragment by doing a transpose
+template <
+  int ElementCount,
+  typename TransposeShape,
+  typename Element
+> struct Transpose;
+/// Specialization for int8_t 4x4 transpose
+template <int ElementCount_>
+struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
+    static const int kElementCount = ElementCount_;
+    using TransposeShape = layout::PitchLinearShape<4,4>;
+    using Element = int8_t;
+    using Fragment = cutlass::Array<Element, kElementCount>;
+    static_assert(!(kElementCount % TransposeShape::kCount), "Shape needs to be multiple of 16 elements to do a 4x4 transpose");
+    CUTLASS_DEVICE
+    void transform(Fragment& dst, Fragment& src) {
+    // Expose src/dst as int arrays.
+    int* src_int = reinterpret_cast<int*>(&src);
+    int* dst_int = reinterpret_cast<int*>(&dst);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementCount / TransposeShape::kCount; i++){
+      int const i0 = 4 * i + 0;
+      int const i1 = 4 * i + 1;
+      int const i2 = 4 * i + 2;
+      int const i3 = 4 * i + 3;
+      int a0 = src_int[i0];
+      int a1 = src_int[i1];
+      int a2 = src_int[i2];
+      int a3 = src_int[i3];
+      int b0, b1, b2, b3, c0;
+      b0 = __byte_perm(a0, a1, 0x0040);
+      c0 = __byte_perm(a2, a3, 0x0040);
+      b0 = __byte_perm(b0, c0, 0x5410);
+      b1 = __byte_perm(a0, a1, 0x0051);
+      c0 = __byte_perm(a2, a3, 0x0051);
+      b1 = __byte_perm(b1, c0, 0x5410);
+      b2 = __byte_perm(a0, a1, 0x0062);
+      c0 = __byte_perm(a2, a3, 0x0062);
+      b2 = __byte_perm(b2, c0, 0x5410);
+      b3 = __byte_perm(a0, a1, 0x0073);
+      c0 = __byte_perm(a2, a3, 0x0073);
+      b3 = __byte_perm(b3, c0, 0x5410);
+      dst_int[i0] = b0;
+      dst_int[i1] = b1;
+      dst_int[i2] = b2;
+      dst_int[i3] = b3;
+    }
+  }
+};
+}  // namespace thread
+}  // namespace layout
+}  // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/thread/unary_op.h ADDED Viewed

	@@ -0,0 +1,105 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/complex.h"
+namespace cutlass {
+namespace transform {
+namespace thread {
+namespace UnaryTransform {
+    struct Identity;    ///< None (i.e., identity)
+    struct Conjugate;   ///< Complex conjugate
+}
+/// Element-wise unary operator that transforms one element of a fragment at a time
+template<
+    typename FragmentIn, ///< Input Fragment
+    typename FragmentOut,///< Output Fragment
+    typename Transform>  ///< Unary transform operator
+class UnaryOp
+{
+    public:
+        CUTLASS_DEVICE
+        static FragmentOut execute(FragmentIn &in)
+        {
+            static_assert(FragmentIn::kElements == FragmentOut::kElements, "Number of elements must match.");
+            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
+                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
+                          "Unary Operator not supported.");
+            FragmentOut out;
+            if (platform::is_same<Transform, UnaryTransform::Identity>::value )
+            {
+                CUTLASS_PRAGMA_UNROLL
+                for (int i=0; i < FragmentIn::kElements; ++i){
+                   out[i] = static_cast<typename FragmentOut::Element>(in[i]);
+                }
+            }
+            else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value )
+            {
+                for (int i=0; i < FragmentIn::kElements; ++i){
+                   out[i] = conj(static_cast<typename FragmentOut::Element>(in[i]));
+                }
+            }
+            return out;
+        }
+};
+template<typename FragmentIn, typename Transform>
+class UnaryOp<FragmentIn, FragmentIn, Transform>
+{
+    public:
+        CUTLASS_DEVICE
+        static FragmentIn execute(FragmentIn &in)
+        {
+            static_assert(platform::is_same<Transform, UnaryTransform::Identity>::value ||
+                          platform::is_same<Transform, UnaryTransform::Conjugate>::value,
+                          "Unary Operator not supported.");
+            if (platform::is_same<Transform, UnaryTransform::Identity>::value )
+            {
+                return in;
+            }
+            else if (platform::is_same<Transform, UnaryTransform::Conjugate>::value )
+            {
+                for(int i=0; i < FragmentIn::kElements; ++i){
+                   in[i] = conj(in[i]);
+                }
+            }
+            return in;
+        }
+      };
+    }
+  }
+}

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_iterator.h ADDED Viewed

	@@ -0,0 +1,199 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for matrix of indices (ellColInd matrix)
+*/
+#pragma once
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+namespace ell{
+constexpr unsigned int SmemPow = 8;
+constexpr unsigned int SmemStages = 2;
+constexpr unsigned int SmemSize = 1 << SmemPow;
+constexpr unsigned int SmemMask = (SmemSize*SmemStages-1);
+class SharedStorage{
+  public:
+    Array<int, SmemSize*SmemStages> array;
+};
+class Iterator{
+  public:
+  using Layout = layout::PitchLinear;
+  using LongIndex = typename Layout::LongIndex;
+  private:
+    const int *gmem_col_idx_;
+    int *smem_col_idx_;
+    const int  block_size_;
+    const int  base_idx_;
+    const int  k_shape_;
+    const int  ell_increment_;
+    const int  array_length_;
+    int  col_idx_base_;
+    int  residue_;
+    int  counter_;
+    int  pow2_;
+    int  residue_shape_;
+    int  smem_offset_;
+    int  smem_stage_;
+    int  gmem_offset_;
+    int  lane_;
+    bool is_pow2_;
+    bool is_residue_tile_;
+  public:
+    CUTLASS_DEVICE
+    void load_ell_indices(){
+      for(int i=threadIdx.x; i<SmemSize; i+=blockDim.x){
+        int idx = (gmem_offset_+i < array_length_) ? gmem_offset_+i : array_length_-1;
+        int gmem_col_idx = gmem_col_idx_[idx] - base_idx_;
+        smem_col_idx_[i + smem_stage_ * SmemSize] =
+          (gmem_col_idx >= 0) ? gmem_col_idx : -1;
+      }
+      gmem_offset_ += SmemSize;
+      smem_stage_ ^= 1;
+    }
+    CUTLASS_DEVICE
+    Iterator(
+        SharedStorage& shared_storage_base,
+        const int* col_idx,
+        const int& block_size,
+        const int& base_idx,
+        const int  k_shape,
+        const int& problem_size_k,
+        const int& ell_stride,
+        const int& thread_idx)
+        : residue_(0),
+          counter_(0),
+          smem_offset_(0),
+          smem_stage_(0),
+          gmem_offset_(0),
+          block_size_(block_size),
+          base_idx_(base_idx),
+          k_shape_(k_shape),
+          ell_increment_(ell_stride * block_size),
+          array_length_((problem_size_k + block_size_ - 1) / block_size_),
+          residue_shape_(problem_size_k % k_shape_),
+          is_residue_tile_(residue_shape_ != 0),
+          smem_col_idx_(reinterpret_cast<int*>(&shared_storage_base.array)),
+          gmem_col_idx_(const_cast<int*>(col_idx)),
+          lane_(thread_idx % 32) {
+      load_ell_indices();
+      __syncthreads();
+      is_pow2_ = ((block_size_ & (block_size_ - 1)) == 0);
+      if( is_pow2_ && k_shape <= block_size_ ) lane_ = 0;
+      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_;
+      pow2_ = 0;
+      while(block_size_ >> (pow2_ + 1)) ++pow2_;
+    }
+    CUTLASS_DEVICE
+    int get_blocksize(){
+      return block_size_;
+    }
+    CUTLASS_DEVICE
+    Iterator &operator++(){
+      if(is_residue_tile_){
+        residue_ += residue_shape_;
+        is_residue_tile_ = false;
+      } else {
+        residue_ += k_shape_;
+      }
+      if(residue_ < block_size_){
+        return *this;
+      }
+      if((array_length_ > SmemSize) && (((smem_offset_ >> SmemPow) & 1) != smem_stage_))
+        load_ell_indices();
+      if(residue_ == block_size_){
+        ++smem_offset_;
+        counter_ += ell_increment_;
+        residue_ = 0;
+        col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
+        return *this;
+      }
+      if(is_pow2_){
+        smem_offset_ += residue_ >> pow2_;
+        counter_ += (residue_ >> pow2_) * ell_increment_;
+        residue_ = residue_ & ((1 << pow2_) - 1);
+      }
+      else {
+        smem_offset_ += residue_ / block_size_;
+        counter_ += (residue_ / block_size_) * ell_increment_;
+        residue_ %= block_size_;
+      }
+      col_idx_base_ = smem_col_idx_[(smem_offset_ + lane_) & SmemMask] * ell_increment_ - counter_;
+      return *this;
+    }
+    CUTLASS_DEVICE
+    LongIndex get_offset(const int& idx) {
+      int num_jump_tiles;
+      if(is_pow2_)
+        num_jump_tiles = (idx + residue_) >> pow2_;
+      else
+        num_jump_tiles = (idx + residue_) / block_size_;
+      int tmp = __shfl_sync(0xffffffff, col_idx_base_, num_jump_tiles);
+      return tmp - num_jump_tiles * ell_increment_;
+    }
+    CUTLASS_DEVICE
+    LongIndex get_offset_fast() {
+      return col_idx_base_;
+    }
+};
+}
+}
+}
+}

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h ADDED Viewed

	@@ -0,0 +1,1350 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaMultistage
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// EllPredicatedTileAccessIterator
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType>
+class EllPredicatedTileAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend EllPredicatedTileAccessIterator;
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    LongIndex stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// strided dimension
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+   public:
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : stride_(layout.stride(0)) {
+      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
+      }
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
+                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
+                                     sizeof_bits<Element>::value / 8;
+    };
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+  /// Size of tensor
+  TensorCoord extent_;
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+  /// Offset to the first steady-state tile
+  TensorCoord residue_offset_;
+  /// Initial offset to define ELL block
+  TensorCoord ell_offset_;
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      TensorCoord coord = thread_offset_ + iteration_coord;
+      bool guard;
+      if (is_steady_state) {
+        if (kAdvanceRank == 0) {
+          guard = (coord.strided() < extent.strided());
+        } else {
+          guard = (coord.contiguous() < extent.contiguous());
+        }
+      } else {
+        guard = (coord.strided() < extent.strided() &&
+                 coord.contiguous() < extent.contiguous());
+      }
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+    }
+  }
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        extent_(extent),
+        is_residue_tile_(true) {
+    TensorCoord residue_extent;
+    if (kAdvanceRank) {
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
+      if (!residue_size) {
+        residue_size = Shape::kStrided;
+      }
+      residue_offset_ = make_Coord(0, residue_size);
+      residue_extent = make_Coord(
+        extent_.contiguous(),
+        min(threadblock_offset.strided() + residue_size, extent_.strided())
+      );
+    } else {
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
+      if (!residue_size) {
+        residue_size = Shape::kContiguous;
+      }
+      residue_offset_ = make_Coord(residue_size, 0);
+      residue_extent = make_Coord(
+        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
+        extent_.strided()
+      );
+    }
+    // Per-thread offset in logical coordinates of tensor
+    ell_offset_ = ThreadMap::initial_offset(thread_id);
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+    compute_predicates_(residue_extent, false);
+    set_iteration_index(0);
+  }
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      thread_offset_ += residue_offset_;
+      Layout layout(params_.stride_);
+      add_pointer_offset(layout(residue_offset_));
+      compute_predicates_(extent_, true);
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    }
+    is_residue_tile_ = false;
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;
+  }
+  /// Returns a k_location
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    if(kAdvanceRank){ //strided
+      return ell_offset_.strided() + iteration_strided_ * ThreadMap::Delta::kStrided;
+    }else{
+      return ell_offset_.contiguous() + iteration_contiguous_ * ThreadMap::Delta::kContiguous + iteration_vector_ * AccessType::kElements;
+    }
+  }
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    if(kAdvanceRank)
+      return params_.stride_;
+    else
+      return 1;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    Mask mask;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = 0u;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      TensorCoord coord = ell_offset_ + iteration_coord;
+      bool guard;
+      if (kAdvanceRank == 0) {
+        guard = (coord.strided() < blocksize);
+      } else {
+        guard = (coord.contiguous() < blocksize);
+      }
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      mask[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] &= predicates_[i];
+    }
+    set_mask(mask);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    int pred_idx =
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();
+  }
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileAccessIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class EllPredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();
+  }
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileAccessIterator for column-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class EllPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::ColumnMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();
+  }
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileAccessIterator for row-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class EllPredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::RowMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+  /// Construct a EllPredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  CUTLASS_HOST_DEVICE
+  int get_k() const {
+    return iterator_.get_k();
+  }
+  CUTLASS_HOST_DEVICE
+  int get_stride() const {
+    return iterator_.get_stride();
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileAccessIterator operator++(int) {
+    EllPredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h ADDED Viewed

	@@ -0,0 +1,1315 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Ell iterator for Blocked-Ell matrix (ellValue matrix) used with EllMmaPipelined
+*/
+#pragma once
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+#include "cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h"
+#include "cutlass/transform/threadblock/ell_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// EllPredicatedTileIterator
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic.
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::EllPredicatedTileIterator;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  int AccessSize = ThreadMap::kElementsPerAccess
+>
+class EllPredicatedTileIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize>
+class EllPredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
+                             ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      EllPredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend EllPredicatedTileIterator;
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    CUTLASS_HOST_DEVICE
+    Params() { }
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return address_iterator_.get_stride(); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { address_iterator_.ell_add_mask(blocksize); }
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+  CUTLASS_DEVICE
+  void load_with_ell_index(Fragment &frag, EllIterator &ell_iter) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          LongIndex ell_offset = 0;
+          int k_offset = address_iterator_.get_k();
+          ell_offset = ell_iter.get_offset(k_offset) * sizeof(Element);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          bool is_valid = address_iterator_.valid();
+          is_valid = is_valid && (ell_offset >= 0);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, is_valid);
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index_fast(Fragment &frag, EllIterator &ell_iter) {
+    LongIndex ell_offset = ell_iter.get_offset_fast() * sizeof(Element);
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + ell_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          bool is_valid = address_iterator_.valid();
+          is_valid = is_valid && (ell_offset >= 0);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, is_valid);
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class EllPredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, AccessSize> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend EllPredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+    }
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// add mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class EllPredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, AccessSize> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend EllPredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+    };
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): EllPredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// add mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) {
+    iterator_.ell_add_mask(blocksize);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileIterator for interleaved data.  It is mapped
+/// to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class EllPredicatedTileIterator<Shape_, Element_,
+                             layout::ColumnMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Iterator for ELL storage
+  using EllIterator = typename cutlass::transform::threadblock::ell::Iterator;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index(frag, ell_iter);
+  }
+  CUTLASS_DEVICE
+  void load_with_ell_index_fast(Fragment &frag, EllIterator& ell_iter) {
+    iterator_.load_with_ell_index_fast(frag, ell_iter);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of EllPredicatedTileIterator for interleaved-32 data.  It is
+/// mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class EllPredicatedTileIterator<Shape_, Element_,
+                             layout::RowMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = EllPredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend EllPredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    CUTLASS_HOST_DEVICE
+    Params() {}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+  /// Construct a EllPredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : EllPredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  EllPredicatedTileIterator operator++(int) {
+    EllPredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns a stride
+  CUTLASS_HOST_DEVICE
+  int get_stride() const { return iterator_.get_stride(); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// add mask for small tiles in ELL
+  CUTLASS_HOST_DEVICE
+  void ell_add_mask(int blocksize) { iterator_.ell_add_mask(blocksize); }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h ADDED Viewed

	@@ -0,0 +1,375 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+    This iterator uses masks to guard out-of-bounds accesses.
+    It can be used to load the gamma and beta vectors of layernorm which is loop variant.
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedScaleBiasVectorAccessIterator
+///
+template <typename ThreadblockShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for fprop pitch-linear data.
+///
+template <typename ThreadblockShape_, typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                              Element_,
+                                              layout::PitchLinear> {
+ public:
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+  static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess;
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  TensorCoord thread_offset_;
+  int problem_size_k_;
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+  bool guard_;
+  TensorCoord::Index residue_size_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Extent of tensor
+      int problem_size_k,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset) {
+    pointer_ = (thread_id < kThreads)
+                   ? reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(scale_pointer))
+                   : reinterpret_cast<BytePointer>(
+                         const_cast<NonConstPointer>(bias_pointer));
+    // Per-thread offset in logical coordinates of tensor
+    int thread_base = (thread_id < kThreads) ? 0 : kThreads;
+    problem_size_k_ = problem_size_k;
+    is_residue_tile_ = true;
+    residue_size_ = (problem_size_k_ - threadblock_offset.contiguous()) % ThreadblockShape::kContiguous;
+    if (residue_size_ == 0) {
+      residue_size_ = ThreadblockShape::kContiguous;
+    }
+    guard_ = ((thread_id - thread_base) * kElementsPerAccess) < residue_size_;
+    thread_offset_ =
+        threadblock_offset +
+        TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0);
+    set_iteration_index(0);
+  }
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      /// Extent of tensor
+      int problem_size_k,
+      /// Pointer to start of scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to start of scale vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorAccessIterator(problem_size_k,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+  /// Advances an iterator along logical dimensions of matrix in units of whole threadblock tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    guard_ = threadIdx.x < kThreads * 2;
+    TensorCoord offset = is_residue_tile_ ?
+      TensorCoord(residue_size_ + ThreadblockShape::kContiguous * (tile_offset.contiguous() - 1), 0)
+      : TensorCoord(ThreadblockShape::kContiguous * tile_offset.contiguous(), 0);
+    thread_offset_ =
+        thread_offset_ +
+        offset;
+    is_residue_tile_ = false;
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        (thread_offset_.contiguous() * sizeof_bits<Element>::value / 8));
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    guard_ &= (!enable);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return guard_;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename ThreadblockShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorAccessIterator<ThreadblockShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+  using ThreadblockShape = ThreadblockShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator<
+      layout::PitchLinearShape<ThreadblockShape::kColumn, ThreadblockShape::kRow>,
+      Element,
+      layout::PitchLinear>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      ///< Extent of tensor
+      int problem_size_k,
+      ///< Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      ///< Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(problem_size_k, scale_pointer, bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator(
+      int problem_size_k,  ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id                ///< ID of each participating thread
+      )
+      : PredicatedScaleBiasVectorAccessIterator(problem_size_k,
+                                                scale_pointer, bias_pointer,
+                                                thread_id, make_Coord(0, 0)) {}
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// threadblock tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorAccessIterator operator++(int) {
+    PredicatedScaleBiasVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h ADDED Viewed

	@@ -0,0 +1,328 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of scale and bias vectors.
+    This iterator uses masks to guard out-of-bounds accesses.
+    This can be used to load var and mean vectors in layernorm which is loop invariant.
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedScaleBiasVectorIterator
+///
+template <typename WarpShape,
+          typename Element,
+          typename Layout>
+class PredicatedScaleBiasVectorIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for wgrad pitch-linear data.
+///
+template <typename WarpShape_, typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::PitchLinear> {
+ public:
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  static int const kElementsPerAccess = 1;
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+  static int const kIterations = WarpShape::kContiguous / 8;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Internal pointer to first access of tile
+  ConstPointer scale_pointer_;
+  ConstPointer bias_pointer_;
+  /// Size of tensor
+  int problem_size_;
+  int32_t thread_offset_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      /// Extent of tensor
+      int problem_size,
+      /// Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : problem_size_(problem_size),
+        scale_pointer_(scale_pointer),
+        bias_pointer_(bias_pointer) {
+    thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4;
+  }
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      /// Extent of tensor
+      int problem_size,
+      /// Pointer to start of scale vector
+      ConstPointer scale_pointer,
+      /// Pointer to start of scale vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedScaleBiasVectorIterator(problem_size,
+                                          scale_pointer, bias_pointer,
+                                          thread_id, make_Coord(0, 0)) {}
+  /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous());
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    frag.fill(__float2half2_rn(0.0f));
+    __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag);
+    // load scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2].x,
+        scale_pointer_ + thread_offset_ + c * 8,
+        (thread_offset_ + c * 8) < problem_size_
+      );
+    }
+    // load bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      cutlass::arch::global_load<
+        __half,
+        sizeof(AccessType)
+      >(
+        frag_ptr[c * 2 + 1].x,
+        bias_pointer_ + thread_offset_ + c * 8,
+        (thread_offset_ + c * 8) < problem_size_
+      );
+    }
+    // duplicate scale
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2].y = frag_ptr[c * 2].x;
+    }
+    // duplicate bias
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < kIterations; ++c) {
+      frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x;
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename WarpShape_,
+          typename Element_>
+class PredicatedScaleBiasVectorIterator<WarpShape_,
+                                        Element_,
+                                        layout::RowMajor> {
+ public:
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedScaleBiasVectorIterator<
+      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
+      Element,
+      layout::PitchLinear>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+  using Fragment = typename UnderlyingIterator::Fragment;
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      ///< Extent of tensor
+      int problem_size,
+      ///< Pointer to the start of the scale vector
+      ConstPointer scale_pointer,
+      ///< Pointer to the start of the bias vector
+      ConstPointer bias_pointer,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(problem_size, scale_pointer, bias_pointer,
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedScaleBiasVectorIterator(
+      int problem_size,  ///< Extent of tensor
+      ConstPointer scale_pointer,  ///< Pointer to the start of the scale vector
+      ConstPointer bias_pointer,   ///< Pointer to the start of the bias vector
+      int thread_id                ///< ID of each participating thread
+      )
+      : PredicatedScaleBiasVectorIterator(problem_size,
+                                          scale_pointer, bias_pointer,
+                                          thread_id, make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// threadblock tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    iterator_.load(frag);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h ADDED Viewed

	@@ -0,0 +1,2118 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+    from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses. The first tile this
+    iterator visits maybe partial, then the remaining tiles are complete. So, we
+    only need to compute the predicates twice, once before the first tile and
+    once for the remaining full tiles which can share the same predicates.
+    A precomputed "Params" object minimizes the amount of state that must be
+    stored in registers, and integer addition is used to advance the pointer
+    through memory.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/permute.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileAccessIteratorPredicates
+///
+template <typename Shape_, typename Element_, typename Layout_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIteratorPredicates {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = typename Layout::TensorCoord;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+// private:
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+  /// Size of tensor
+  TensorCoord extent_;
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+  /// Offset to the first steady-state tile
+  TensorCoord residue_offset_;
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      TensorCoord coord = thread_offset_ + iteration_coord;
+      bool guard;
+      if (is_steady_state) {
+        if (kAdvanceRank == 0) {
+          guard = (coord.strided() < extent.strided());
+        } else {
+          guard = (coord.contiguous() < extent.contiguous());
+        }
+      } else {
+        guard = (coord.strided() < extent.strided() &&
+                 coord.contiguous() < extent.contiguous());
+      }
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+    }
+  }
+  CUTLASS_HOST_DEVICE
+  void set_predicates(int thread_id, TensorCoord const &threadblock_offset) {
+    TensorCoord residue_extent;
+    if (kAdvanceRank) {
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.strided()) % Shape::kStrided;
+      if (!residue_size) {
+        residue_size = Shape::kStrided;
+      }
+      residue_offset_ = make_Coord(0, residue_size);
+      residue_extent = make_Coord(
+        extent_.contiguous(),
+        min(threadblock_offset.strided() + residue_size, extent_.strided())
+      );
+    } else {
+      typename TensorCoord::Index residue_size = (extent_[kAdvanceRank] - threadblock_offset.contiguous()) % Shape::kContiguous;
+      if (!residue_size) {
+        residue_size = Shape::kContiguous;
+      }
+      residue_offset_ = make_Coord(residue_size, 0);
+      residue_extent = make_Coord(
+        min(extent_.contiguous(), threadblock_offset.contiguous() + residue_size),
+        extent_.strided()
+      );
+    }
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+    compute_predicates_(residue_extent, false);
+    set_iteration_index(0);
+  }
+  /// Default constructor
+  PredicatedTileAccessIteratorPredicates() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorPredicates(
+      /// Extent of tensor
+      TensorCoord extent)
+      : extent_(extent) {
+	}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorPredicates &operator++() {
+    return *this;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    int pred_idx =
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileAccessIterator
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType, bool Gather = false,
+          typename PermuteLayout = layout::NoPermute>
+class PredicatedTileAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape, Element, Layout, AdvanceRank, ThreadMap, AccessType>;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+  static bool constexpr Permute = !platform::is_same<PermuteLayout, layout::NoPermute>::value
+                               && !platform::is_same<PermuteLayout, layout::InversePermute<layout::NoPermute>>::value;
+  using Mask = typename UnderlyingPredicates::Mask;
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+    using Base = PredicatedTileAccessIteratorParams;
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) :
+      Base(layout.stride(0),
+            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
+        ) { }
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) :
+      Base(base) { }
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  UnderlyingPredicates the_predicates;
+  /// Parameters object with precomputed internal state
+  Params params_;
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset separated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+  /// Gather indices
+  int const *indices_;
+  /// Function to perform layout permutation and offset computation
+  PermuteLayout permute_layout_;
+  /// Tracks thread's coordinate offset in the matrix for current tile.
+  /// This is only used in the following cases:
+  /// - when Gather is true, strided coordinate needed to access indices (contiguous offset is tracked via pointer_)
+  /// - when Permute is true, both coordinates are needed as input into permutation function (pointer_ is fixed)
+  TensorCoord coord_offset_;
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+	  the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : params_(params),
+	      pointer_(reinterpret_cast<BytePointer>(
+                 const_cast<NonConstPointer>(pointer))),
+	      the_predicates(extent),
+        is_residue_tile_(true),
+        indices_(indices),
+        permute_layout_(TensorCoord(extent.contiguous(), extent.strided()), params.stride_) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    if (Gather) {
+      assert(indices_);
+    }
+    // update internal pointers
+    Layout layout(params_.stride_);
+    if (!Gather && !Permute) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      coord_offset_ = the_predicates.thread_offset_;
+      if (!Permute) {
+        add_pointer_offset(layout(make_Coord(coord_offset_.contiguous(), 0)));
+      }
+    }
+  }
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    the_predicates.set_iteration_index(index);
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+      the_predicates.compute_predicates_(the_predicates.extent_, true);
+      Layout layout(params_.stride_);
+      if (!Gather && !Permute) {
+        add_pointer_offset(layout(the_predicates.residue_offset_));
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+          pointer_ += Shape::kContiguous * tile_offset.contiguous() * sizeof_bits<Element>::value / 8;
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+          pointer_ += Shape::kStrided * tile_offset.strided() * sizeof_bits<Element>::value / 8;
+        }
+      } else {
+        coord_offset_.strided() = the_predicates.thread_offset_.strided() + Shape::kStrided * (tile_offset.strided() - kAdvanceRank);
+        if (!Permute) {
+          add_pointer_offset(layout(make_Coord(the_predicates.residue_offset_.contiguous(), 0)));
+          add_pointer_offset(Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank)));
+        } else {
+          coord_offset_.contiguous() = the_predicates.thread_offset_.contiguous() + Shape::kContiguous * (tile_offset.contiguous() - (1 - kAdvanceRank));
+        }
+      }
+    } else {
+      if (!Gather && !Permute) {
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+          pointer_ += Shape::kContiguous * tile_offset.contiguous();
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+          pointer_ += Shape::kStrided * tile_offset.strided();
+        }
+      } else {
+        coord_offset_.strided() += Shape::kStrided * tile_offset.strided();
+        if (!Permute) {
+          add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+        } else {
+          coord_offset_.contiguous() += Shape::kContiguous * tile_offset.contiguous();
+        }
+      }
+    }
+    is_residue_tile_ = false;
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    if (Gather || Permute)
+    {
+      if (!valid()) {
+        return nullptr;
+      }
+      Index coord_contig  = (Permute ? coord_offset_.contiguous() : 0) + the_predicates.iteration_contiguous_ * ThreadMap::Delta::kContiguous + the_predicates.iteration_vector_ * AccessType::kElements;
+      Index coord_strided = coord_offset_.strided() + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+      if (Gather) {
+        coord_strided = indices_[coord_strided];
+      }
+      LongIndex offset = Permute ? permute_layout_(TensorCoord(coord_contig, coord_strided)) : (coord_strided * LongIndex(params_.stride_) + coord_contig);
+      return reinterpret_cast<AccessType *>(pointer_ + OffsetBytes<Element>(offset));
+    }
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + the_predicates.iteration_vector_;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    // Enter here only if (iteration_contiguous_ == ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      if (!Gather && !Permute) {
+        pointer_ += params_.inc_strided_;
+      }
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+    if (!Gather && !Permute) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+      // now return to start tile - if the iterator is subsequently advanced, this
+      // subtraction as well as the subsequent integer addition are both elided by
+      // the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    the_predicates.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    the_predicates.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    the_predicates.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    the_predicates.get_mask(mask);
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    return the_predicates.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType,
+      Gather, PermuteLayout>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column()),
+                  indices) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, bool Gather,
+          typename PermuteLayout>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather,
+                                   PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType,
+      Gather, PermuteLayout>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){};
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row()),
+                  indices) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for affine rank 2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates<
+      Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap, AccessType>;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingPredicates::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIterator;
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    Coord<Layout::kStrideRank, Layout::LongIndex> stride_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// contiguous dimension
+    LongIndex inc_contiguous_;
+    /// amount (in byte) to increment pointer from first access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access of current
+    /// contiguous dimension to first access of next one.
+    LongIndex inc_next_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+   public:
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_contiguous_(0), inc_strided_(0), inc_next_(0), inc_advance_(0) { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : stride_({layout.stride(0), layout.stride(1)}) {
+      inc_contiguous_ = (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) *
+                     sizeof_bits<Element>::value / 8;
+      inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+      inc_next_strided_ = inc_strided_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_;
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_[1]) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * stride_[0] * sizeof_bits<Element>::value / 8;
+      }
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_;
+    };
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+  //
+  // Data members
+  //
+  /// Parameters object with precomputed internal state
+  Params params_;
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  UnderlyingPredicates the_predicates;
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent,
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+          the_predicates.compute_predicates_(extent, is_steady_state);
+  }
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        the_predicates(extent),
+	is_residue_tile_(true) {
+    the_predicates.set_predicates(thread_id, threadblock_offset);
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(the_predicates.thread_offset_));
+  }
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { the_predicates.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      the_predicates.thread_offset_ += the_predicates.residue_offset_;
+      Layout layout(params_.stride_);
+      add_pointer_offset(layout(the_predicates.residue_offset_));
+      the_predicates.compute_predicates_(the_predicates.extent_, true);
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1] - 1);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0] - 1);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]);
+        pointer_ += Shape::kContiguous * tile_offset[0];
+      } else {
+        pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]);
+        pointer_ += Shape::kStrided * tile_offset[1];
+      }
+    }
+    is_residue_tile_ = false;
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(pointer_) + the_predicates.iteration_vector_;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    the_predicates.operator++();
+    ++the_predicates.iteration_vector_;
+    if (the_predicates.iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    the_predicates.iteration_vector_ = 0;
+    ++the_predicates.iteration_contiguous_;
+    if (the_predicates.iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      pointer_ += params_.inc_contiguous_;
+      return *this;
+    }
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    the_predicates.iteration_contiguous_ = 0;
+    ++the_predicates.iteration_strided_;
+    if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_next_strided_;
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    the_predicates.iteration_strided_ = 0;
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { the_predicates.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { the_predicates.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { the_predicates.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { the_predicates.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return the_predicates.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for affine rank 2 column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.row(), tile_offset.column()));
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for affine rank-2 row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::AffineRankN<2>, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset(make_Coord(tile_offset.column(), tile_offset.row()));
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for column-major interleaved data.
+/// It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class PredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::ColumnMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for row-major interleaved data.
+//  It is mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_, int InterleavedK>
+class PredicatedTileAccessIterator<Shape_, Element_,
+                                   layout::RowMajorInterleaved<InterleavedK>,
+                                   AdvanceRank, ThreadMap_, AccessType_, false,
+                                   layout::NoPermute> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileAccessIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+  /// Construct a PredicatedTileAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator operator++(int) {
+    PredicatedTileAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() { return iterator_.valid(); }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h ADDED Viewed

	@@ -0,0 +1,834 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+   from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses and visits the last
+   "residue" tile first, with the objective of minimizing predicate mask updates
+   during steady-state operation.
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h"
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileAccessIterator2dThreadTile
+///
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap, typename AccessType>
+class PredicatedTileAccessIterator2dThreadTile;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount = (ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kStrided + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+  /// Uses a non-template class
+  struct Params : PredicatedTileAccessIteratorParams {
+   public:
+    friend PredicatedTileAccessIterator2dThreadTile;
+    using Base = PredicatedTileAccessIteratorParams;
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) :
+      Base(layout.stride(0),
+            MakePredicatedTileAccessIteratorDesc<Shape, Element, Layout, kAdvanceRank, ThreadMap>()()
+        ) { }
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base) :
+      Base(base) { }
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+  /// Size of tensor
+  TensorCoord extent_;
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+  /// Index of residue tile
+  int residue_tile_idx_;
+  /// Used for out-of-order visitation
+  bool is_residue_tile_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+  /// Tracks iterations within the thread loop
+  int iteration_thread_;
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_HOST_DEVICE
+  void compute_predicates_(
+      /// optionally, simplify predicate calculation during 'steady state' phase
+      bool is_steady_state = false) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++) {
+          TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous,
+                                      ts + s * ThreadMap::Delta::kStrided);
+          TensorCoord coord = thread_offset_ + iteration_coord;
+          bool guard;
+          if (is_steady_state) {
+            if (kAdvanceRank == 0) {
+              guard = (coord.strided() < extent_.strided());
+            } else {
+              guard = (coord.contiguous() < extent_.contiguous());
+            }
+          } else {
+            guard = (coord.strided() < extent_.strided() &&
+                     coord.contiguous() < extent_.contiguous());
+          }
+          int pred_idx = ts + c *  ThreadMap::ThreadAccessShape::kStrided + s * ThreadMap::Iterations::kContiguous *  ThreadMap::ThreadAccessShape::kStrided;
+          int word_idx = pred_idx / kPredicatesPerWord;
+          int residual = pred_idx % kPredicatesPerWord;
+          int byte_idx = residual / kPredicatesPerByte;
+          int bit_idx = residual % kPredicatesPerByte;
+          predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+        }
+      }
+    }
+  }
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(
+            const_cast<NonConstPointer>(pointer))),
+        extent_(extent),
+        is_residue_tile_(true) {
+    TensorCoord residue_offset;
+    if (kAdvanceRank) {
+      residue_tile_idx_ =
+          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
+          Shape::kStrided;
+      residue_offset = make_Coord(0, residue_tile_idx_ * Shape::kStrided);
+    } else {
+      residue_tile_idx_ =
+          (extent_[kAdvanceRank] - threadblock_offset[kAdvanceRank] - 1) /
+          Shape::kContiguous;
+      residue_offset = make_Coord(residue_tile_idx_ * Shape::kContiguous, 0);
+    }
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + residue_offset +
+                     ThreadMap::initial_offset(thread_id);
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+    compute_predicates_(false);
+    set_iteration_index(0);
+  }
+  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    int residual = index % (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
+    iteration_strided_ = index / (ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided);
+    iteration_contiguous_ = residual / ThreadMap::ThreadAccessShape::kStrided;
+    iteration_thread_ = residual % ThreadMap::ThreadAccessShape::kStrided;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += int(sizeof(Element)) * pointer_offset;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    if (is_residue_tile_) {
+      TensorCoord residue_offset;
+      if (kAdvanceRank) {
+        residue_offset = TensorCoord(0, residue_tile_idx_ * Shape::kStrided);
+      } else {
+        residue_offset = TensorCoord(residue_tile_idx_ * Shape::kContiguous, 0);
+      }
+      thread_offset_ -= residue_offset;
+      Layout layout(params_.stride_);
+      add_pointer_offset(-layout(residue_offset));
+      compute_predicates_(true);
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * (tile_offset.strided() - 1);
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * (tile_offset.contiguous() - 1);
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    } else {
+      if (kAdvanceRank) {
+        pointer_ += params_.inc_advance_ * tile_offset.strided();
+        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      } else {
+        pointer_ += params_.inc_advance_ * tile_offset.contiguous();
+        pointer_ += Shape::kStrided * tile_offset.strided();
+      }
+    }
+    is_residue_tile_ = false;
+  }
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *ret_val = reinterpret_cast<AccessType *>(
+                pointer_ + (iteration_thread_ * params_.stride_  + iteration_contiguous_ * ThreadMap::Delta::kContiguous) * int(sizeof(Element)));
+    return ret_val;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+    iteration_thread_++;
+    if (iteration_thread_ < ThreadMap::ThreadAccessShape::kStrided)
+      return *this;
+    iteration_thread_ = 0;
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    int pred_idx =
+      iteration_thread_ +
+      iteration_contiguous_ * ThreadMap::ThreadAccessShape::kStrided +
+      iteration_strided_ * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator2dThreadTile;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, typename AccessType_>
+class PredicatedTileAccessIterator2dThreadTile<Shape_, Element_, layout::RowMajor,
+                                   AdvanceRank, ThreadMap_, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIterator2dThreadTile<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIterator2dThreadTile;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))){}
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedTileAccessIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIterator2dThreadTile(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIterator2dThreadTile operator++(int) {
+    PredicatedTileAccessIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h ADDED Viewed

	@@ -0,0 +1,290 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/detail/helper_macros.hpp"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Predicated tile access iterator descriptor object containing template dependent state
+struct PredicatedTileAccessIteratorDesc {
+  int element_size_bits = -1;
+  int advance_rank = -1;
+  layout::PitchLinearCoord threadblock_shape;
+  layout::PitchLinearCoord threadmap_iterations;
+  layout::PitchLinearCoord threadmap_delta;
+  //
+  // Methods
+  //
+  PredicatedTileAccessIteratorDesc() = default;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc(
+    int element_size_bits_,
+    int advance_rank_,
+    layout::PitchLinearCoord threadblock_shape_,
+    layout::PitchLinearCoord threadmap_iterations_,
+    layout::PitchLinearCoord threadmap_delta_
+  ):
+    element_size_bits(element_size_bits_),
+    advance_rank(advance_rank_),
+    threadblock_shape(threadblock_shape_),
+    threadmap_iterations(threadmap_iterations_),
+    threadmap_delta(threadmap_delta_)
+  {
+    #if 0
+    printf("PredicatedTileAccessIteratorDesc(%d, %d, {%d, %d}, {%d, %d}, {%d, %d}})\n",
+      element_size_bits,
+      advance_rank,
+      threadblock_shape.contiguous(), threadblock_shape.strided(),
+      threadmap_iterations.contiguous(), threadmap_iterations.strided(),
+      threadmap_delta.contiguous(), threadmap_delta.strided());
+    #endif
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Helper template to construct an PredicatedTileAccessIteratorDesc from a template
+// dependent state
+template <
+  typename Shape, typename Element, typename Layout,
+  int AdvanceRank, typename ThreadMap>
+  struct MakePredicatedTileAccessIteratorDesc;
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for pitch-linear data.
+template <
+  typename Shape, typename Element, int AdvanceRank,
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::PitchLinear, AdvanceRank, ThreadMap> {
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+    return PredicatedTileAccessIteratorDesc(
+      sizeof_bits<Element>::value,
+      AdvanceRank,
+      {Shape::kContiguous, Shape::kStrided},
+      {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+      {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}
+    );
+}
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for column-major data.
+template <
+  typename Shape, typename Element, int AdvanceRank,
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::ColumnMajor, AdvanceRank, ThreadMap> {
+  static int const kAdvanceRank = AdvanceRank;
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+    return UnderlyingMakeOperator()();
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for row-major data.
+template <
+  typename Shape, typename Element, int AdvanceRank,
+  typename ThreadMap>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::RowMajor, AdvanceRank, ThreadMap> {
+  static int const kAdvanceRank = AdvanceRank;
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+    return UnderlyingMakeOperator()();
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for column-major interleaved data.
+template <
+  typename Shape, typename Element, int AdvanceRank,
+  typename ThreadMap, int InterleavedK>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::ColumnMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kInterleavedK = InterleavedK;
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK, Shape::kColumn / kInterleavedK>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap>;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+    return UnderlyingMakeOperator()();
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIterator for roww-major interleaved data.
+template <
+  typename Shape, typename Element, int AdvanceRank,
+  typename ThreadMap, int InterleavedK>
+struct MakePredicatedTileAccessIteratorDesc <
+    Shape, Element, layout::RowMajorInterleaved<InterleavedK>, AdvanceRank, ThreadMap> {
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kInterleavedK = InterleavedK;
+  using UnderlyingMakeOperator = MakePredicatedTileAccessIteratorDesc<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK, Shape::kRow / kInterleavedK>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap>;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorDesc operator()() {
+    return UnderlyingMakeOperator()();
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Parameters struct
+//
+struct PredicatedTileAccessIteratorParams {
+  using Index = int32_t;
+  using LongIndex = int64_t;
+  //
+  // Data members
+  //
+  /// stride of pitch-linear layout (units of Element)
+  LongIndex stride_ = 0;
+  /// amount (in byte) to increment pointer to move to next access along
+  /// strided dimension
+  LongIndex inc_strided_ = 0;
+  /// amount (in byte) to increment pointer from last access to first access
+  /// of next tile
+  LongIndex inc_next_ = 0;
+  /// amount (in byte) to increment pointer from first access of current tile
+  /// to first access of next tile
+  LongIndex inc_advance_ = 0;
+  //
+  // Methods
+  //
+  CUTLASS_HOST_DEVICE
+  Status initialize(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
+    CUTLASS_ASSERT(desc.element_size_bits > 0);
+    CUTLASS_ASSERT(desc.advance_rank == 0 || desc.advance_rank == 1);
+    stride_ = stride;
+    inc_strided_ = (LongIndex(stride_) * desc.threadmap_delta.strided()) *
+                     desc.element_size_bits / 8;
+    if (desc.advance_rank) {
+      // advance along strided dimension
+      inc_advance_ =
+          desc.threadblock_shape.strided() * LongIndex(stride_) * desc.element_size_bits / 8;
+    } else {
+      // advance along contiguous dimension
+      inc_advance_ = desc.threadblock_shape.contiguous() * desc.element_size_bits / 8;
+    }
+    inc_next_ = inc_advance_ - LongIndex(desc.threadmap_iterations.strided() - 1) *
+                                   desc.threadmap_delta.strided() * LongIndex(stride_) *
+                                   desc.element_size_bits / 8;
+    return Status::kSuccess;
+  }
+  CUTLASS_HOST_DEVICE
+  Status initialize(Index stride, PredicatedTileAccessIteratorDesc desc) {
+    return initialize(LongIndex(stride), desc);
+  }
+  PredicatedTileAccessIteratorParams() = default;
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorParams(Index stride, PredicatedTileAccessIteratorDesc desc) {
+    initialize(stride, desc);
+  }
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorParams(LongIndex stride, PredicatedTileAccessIteratorDesc desc) {
+    initialize(stride, desc);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h ADDED Viewed

	@@ -0,0 +1,892 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+   from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses and visits the last
+   "residue" tile first, with the objective of minimizing predicate mask updates
+   during steady-state operation.
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+*/
+#pragma once
+#include "cutlass/blas3.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileAccessIteratorTriangularMatrix
+///
+template <typename Shape, typename Element, typename Layout,
+          int AdvanceRank, typename ThreadMap,
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType,
+          typename AccessType>
+class PredicatedTileAccessIteratorTriangularMatrix;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+  using CompareOp = typename TrMatrixCompareOp<kFillMode, kDiagType>::Type;
+  static_assert( kFillMode == FillMode::kFull ||
+                 ((kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) && AccessType::kElements == 1),
+                 "BLAS3 iterator for the triangular/symmetric matrix must use AccessType::kElements as 1");
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount =
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    StrideIndex stride_;
+    /// (true)  pitch-linear layout is mapped to row-major matrix
+    /// (false) pitch-linear layout is mapped to column-major matrix
+    bool is_row_major_;
+    /// for vectorized access across the diagonal boundary guard condition is
+    /// checked for the element on the boundary
+    int access_diagonal_boundary_;
+    /// amount (in byte) to increment pointer to move to next access along
+    /// strided dimension
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+   public:
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0), is_row_major_(false), access_diagonal_boundary_(0) { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, bool is_row_major, int access_diagonal_boundary) :
+      stride_(layout.stride(0)), is_row_major_(is_row_major), access_diagonal_boundary_(access_diagonal_boundary) {
+      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
+      }
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
+                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
+                                     sizeof_bits<Element>::value / 8;
+    };
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+  /// Track global memory addresses on the diagonal
+  /// To ignore imag part for diagonal elements of hermitian matrices
+  uint32_t predicates_onDiag_[kPredicateWordCount];
+  /// Size of tensor
+  TensorCoord extent_;
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+      predicates_onDiag_[i] = 0u;
+    }
+    CompareOp compare_op;
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+      TensorCoord coord = thread_offset_ + iteration_coord;
+      bool guard;
+      bool onDiag = false;
+      guard = ((coord.strided() < extent.strided()) &&
+                (coord.contiguous() < extent.contiguous()));
+      // guard access on the wrong side of the triagular matrix diagonal
+      if (kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) {
+        coord += TensorCoord{params_.access_diagonal_boundary_, 0};
+        bool triagular_guard_row_major = compare_op(coord.strided(), coord.contiguous()) | !params_.is_row_major_;
+        bool triagular_guard_col_major = compare_op(coord.contiguous(), coord.strided()) | params_.is_row_major_;
+        guard = guard && triagular_guard_row_major && triagular_guard_col_major;
+        if (kDiagType == DiagType::kUnit) {
+          onDiag = (guard && coord.strided() == coord.contiguous()) ? true : false;
+        }
+      }
+      int pred_idx_onDiag = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx_onDiag = pred_idx_onDiag / kPredicatesPerWord;
+      int residual_onDiag = pred_idx_onDiag % kPredicatesPerWord;
+      int byte_idx_onDiag = residual_onDiag / kPredicatesPerByte;
+      int bit_idx_onDiag = residual_onDiag % kPredicatesPerByte;
+      predicates_onDiag_[word_idx_onDiag] |= (unsigned(onDiag) << (byte_idx_onDiag * 8 + bit_idx_onDiag));
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+    }
+  }
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(const_cast<NonConstPointer>(pointer))),
+        extent_(extent) {
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+    compute_predicates_(extent_);
+    set_iteration_index(0);
+  }
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    if (kAdvanceRank) {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+      pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      thread_offset_ += TensorCoord{0, Shape::kStrided * tile_offset.strided()};
+    } else {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+      pointer_ += Shape::kStrided * tile_offset.strided();
+      thread_offset_ += TensorCoord{Shape::kContiguous * tile_offset.contiguous(), 0};
+    }
+    compute_predicates_(extent_);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    int pred_idx =
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    bool pred = (predicates_onDiag_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    int pred_idx =
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+    //return true;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_,
+            SideMode kSideMode, FillMode kFillMode, DiagType kDiagType,
+            typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType,
+                                   AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap,
+      kSideMode, kFillMode, kDiagType, AccessType>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  static int const kAccessDiagonalBoundary =
+    (kFillMode == FillMode::kLower) ? (AccessType::kElements - 1) : 0;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), false, kAccessDiagonalBoundary){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_,
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType,
+          typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_,
+                                                  kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap,
+      kSideMode, kFillMode, kDiagType, AccessType>;
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+  static int const kAccessDiagonalBoundary =
+    (kFillMode == FillMode::kUpper) ? (AccessType::kElements - 1) : 0;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), true, kAccessDiagonalBoundary){};
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator.h ADDED Viewed

	@@ -0,0 +1,1887 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses. The first tile this
+    iterator visits maybe partial, then the remaining tiles are complete. So, we
+    only need to compute the predicates twice, once before the first tile and
+    once for the remaining full tiles which can share the same predicates.
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+#pragma once
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileIterator
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Visitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic.
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIterator;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  int AccessSize = ThreadMap::kElementsPerAccess,
+  bool Gather = false,
+  typename PermuteLayout = layout::NoPermute
+>
+class PredicatedTileIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, bool Gather, typename PermuteLayout>
+class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType, Gather, PermuteLayout>;
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+    friend PredicatedTileIterator;
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) {}
+    /// Default constructor
+    Params() = default;
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base)
+        : params_(base) {}
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset, indices) {}
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize,
+  bool Gather,
+  typename PermuteLayout
+>
+class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize,
+    Gather,
+    PermuteLayout
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0)))
+    {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()),
+      indices)
+    { }
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize,
+  bool Gather,
+  typename PermuteLayout
+>
+class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank,
+                             ThreadMap_, AccessSize, Gather, PermuteLayout> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize,
+    Gather,
+    PermuteLayout
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,        ///< Initial offset of threadblock
+    int const *indices = nullptr                        ///< Gather indices
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()),
+      indices
+    ) { }
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for affine rank-2 data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRankN<2>, AdvanceRank,
+                             ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRankN<2>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIterator;
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) {}
+    /// Default constructor
+    Params() = default;
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset(make_Coord(0, 1));
+    else
+      address_iterator_.add_tile_offset(make_Coord(1, 0));
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for affine rank 2 column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::AffineRankN<2>,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1)))
+    {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for affine rank 2 row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  int AccessSize
+>
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::AffineRank2RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  // Map to the underlying AffineRankN<2> layout
+  using UnderlyingIterator = PredicatedTileIterator<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::AffineRankN<2>,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given an AffineRankN<2> tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying AffineRankN<2> tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for interleaved data.  It is mapped
+/// to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class PredicatedTileIterator<Shape_, Element_,
+                             layout::ColumnMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kRow * kInterleavedK,
+                               Shape::kColumn / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessSize>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row() * kInterleavedK,
+                                           extent.column() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.row() * kInterleavedK,
+                      threadblock_offset.column() / kInterleavedK)) {}
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator for interleaved-32 data.  It is
+/// mapped to the congruous layout.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int AccessSize, int InterleavedK>
+class PredicatedTileIterator<Shape_, Element_,
+                             layout::RowMajorInterleaved<InterleavedK>,
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  static int const kInterleavedK = InterleavedK;
+  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator<
+      layout::PitchLinearShape<Shape::kColumn * kInterleavedK,
+                               Shape::kRow / kInterleavedK>,
+      Element, layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessSize>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileIterator;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+   public:
+    /// Default constructor
+    Params() = default;
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Default constructor
+  PredicatedTileIterator() = default;
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column() * kInterleavedK,
+                                           extent.row() / kInterleavedK),
+                  thread_id,
+                  layout::PitchLinearCoord(
+                      threadblock_offset.column() * kInterleavedK,
+                      threadblock_offset.row() / kInterleavedK)) {}
+  /// Construct a PredicatedTileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator operator++(int) {
+    PredicatedTileIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h ADDED Viewed

	@@ -0,0 +1,787 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+#pragma once
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h"
+#include "cutlass/transform/thread/transpose.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileIterator2dThreadTile
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Vistitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic.
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIterator2dThreadTile;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  bool Transpose = false
+>
+class PredicatedTileIterator2dThreadTile;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, bool Transpose_>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, Transpose_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Type used for internal memory accesses
+  /// extra set of parenthesis is needed for VS compiler
+  struct alignas((ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value /
+                  8)) AccessType {
+    Array<Element, ThreadMap::kElementsPerAccess> storage;
+    static int const kElements = ThreadMap::kElementsPerAccess;
+  };
+  /// Optionally this fragment can be 4x4 transposed
+  using Transform = thread::Transpose< ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount , layout::PitchLinearShape<4,4>, Element>;
+  static bool const transpose = Transpose_;
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIterator2dThreadTile<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, AccessType>;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::ThreadAccessShape::kCount>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    using Base = typename TileAccessIterator::Params::Base;
+    friend PredicatedTileIterator2dThreadTile;
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    CUTLASS_HOST_DEVICE
+    Params(Base const &base)
+        : params_(base) {}
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
+          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
+              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+          address_iterator_.set_iteration_index(access_idx);
+          if (address_iterator_.valid()) {
+            frag_ptr[access_idx] =
+                *(address_iterator_.get() + pointer_offset);
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+    if (transpose) {
+      Transform t;
+      t.transform(frag, frag);
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_pointer_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int ts = 0; ts < ThreadMap::ThreadAccessShape::kStrided; ts++){
+          int access_idx = ts + c * ThreadMap::ThreadAccessShape::kStrided  + \
+              s * ThreadMap::Iterations::kContiguous * ThreadMap::ThreadAccessShape::kStrided;
+          address_iterator_.set_iteration_index(access_idx);
+          if (address_iterator_.valid()) {
+            *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  bool Transpose_
+>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, Transpose_> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static bool const Transpose = Transpose_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    Transpose
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator2dThreadTile;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {}
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIterator2dThreadTile for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  bool Transpose_
+>
+class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, Transpose_> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  static bool const Transpose = Transpose_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIterator2dThreadTile<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    Transpose
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::ThreadAccessShape::kCount>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIterator2dThreadTile;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) { }
+    CUTLASS_HOST_DEVICE
+    Params(typename UnderlyingIterator::Params::Base const &base)
+        : params_(base) {}
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+  /// Construct a PredicatedTileIterator2dThreadTile with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIterator2dThreadTile(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIterator2dThreadTile operator++(int) {
+    PredicatedTileIterator2dThreadTile self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h ADDED Viewed

	@@ -0,0 +1,818 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors.
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+#pragma once
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedTileIteratorTriangularMatrix
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Vistitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iterator will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic.
+///
+/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params,
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIteratorTriangularMatrix;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  SideMode kSideMode,
+  FillMode kFillMode,
+  DiagType kDiagType,
+  int AccessSize = ThreadMap::kElementsPerAccess
+>
+class PredicatedTileIteratorTriangularMatrix;
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIteratorTriangularMatrix for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_,
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType,
+          int AccessSize>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_,
+                                             kSideMode, kFillMode, kDiagType,
+                                             AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIteratorTriangularMatrix<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, kSideMode, kFillMode, kDiagType, AccessType>;
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorTriangularMatrix;
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    CUTLASS_HOST_DEVICE
+    Params() { }
+  };
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIteratorTriangularMatrix for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode,
+  FillMode kFillMode,
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_,
+                                              kSideMode, kFillMode, kDiagType,
+                                              AccessSize> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIteratorTriangularMatrix;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+    }
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedTileIteratorTriangularMatrix for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode,
+  FillMode kFillMode,
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_,
+                                            kSideMode, kFillMode, kDiagType,
+                                            AccessSize> {
+public:
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1,
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    AccessSize
+  >;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+    friend PredicatedTileIteratorTriangularMatrix;
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+  public:
+    CUTLASS_HOST_DEVICE
+    Params() { }
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+    };
+  };
+private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h ADDED Viewed

	@@ -0,0 +1,417 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of loading small
+    vectors from the global memory.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// PredicatedVectorAccessIterator
+///
+template <
+    /// Shape of the vector accessed by the entire threadblock
+    typename Shape,
+    /// Shape of the vector accessed by the warp
+    typename WarpShape,
+    /// Type of Element
+    typename Element,
+    /// Layout of the vector
+    typename Layout,
+    /// Number of elements for each access
+    int ElementsPerAccess,
+    /// Support residual tile
+    bool EnableResidualAccess = false
+>
+class PredicatedVectorAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Vector access iterator specialized for vectors, e.g. scale and bias
+/// Thread arrangements are for TensorOps
+///
+template <
+  typename Shape_,
+  typename WarpShape_,
+  typename Element_,
+  int ElementsPerAccess,
+  bool EnableResidualAccess
+>
+class PredicatedVectorAccessIterator <
+  Shape_,
+  WarpShape_,
+  Element_,
+  layout::PitchLinear,
+  ElementsPerAccess,
+  EnableResidualAccess
+> {
+  public:
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+//  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kThreads = 32;
+  static int const kRowsPerIteration = 8;
+  static int const kThreadsPerRow = kThreads / kRowsPerIteration;
+  static int const kThreadsPerRowMask = 0x3;
+  static int const kIterations = WarpShape::kContiguous / (kThreadsPerRow * kElementsPerAccess);
+  static int const kWarpCountStrided = Shape::kStrided / WarpShape::kStrided;
+  using AccessType = AlignedArray<Element, kElementsPerAccess>;
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+ private:
+  //
+  // Data members
+  //
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+  /// Extent of tensor
+  TensorCoord extent_;
+  /// pointer offset of each thread
+  TensorCoord thread_offset_;
+  /// iteration index
+  LongIndex iteration_;
+  /// residual access
+  bool is_residual_;
+  /// residual offset of each thread
+  TensorCoord residual_offset_;
+ public:
+  /// Constructs a vector access iterator
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+    /// Pointer to the start of the vector
+    ConstPointer pointer,
+    /// Extent of vector
+    TensorCoord extent,
+    /// ID of each participating thread
+    int thread_id,
+    /// ID of each participating warp
+    int warp_id,
+    /// Initial offset of threadblock
+    TensorCoord const &threadblock_offset)
+    : pointer_(reinterpret_cast<BytePointer>(
+                       const_cast<NonConstPointer>(pointer))),
+      extent_(extent),
+      is_residual_(false) {
+    int warp_offset = (warp_id / kWarpCountStrided) * WarpShape::kContiguous;
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + TensorCoord(warp_offset, 0) +
+        TensorCoord((thread_id & kThreadsPerRowMask) * kElementsPerAccess, 0);
+    set_iteration_index(0);
+    if(EnableResidualAccess) {
+      // compute residual offset
+      typename TensorCoord::Index residual_size = extent_.contiguous() % WarpShape::kContiguous;
+      if (residual_size) {
+        is_residual_ = true;
+        residual_offset_ = make_Coord(residual_size, 0);
+      }
+    }
+  }
+  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+    /// Pointer to start of vector
+    ConstPointer pointer,
+    /// Extent of vector
+    TensorCoord extent,
+    ///< ID of each participating thread
+    int thread_id,
+    /// ID of each participating warp
+    int warp_id)
+    : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id,
+                                     make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_ = index;
+  }
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(
+      TensorCoord const &tile_offset) {
+    thread_offset_ =
+        thread_offset_ +
+        TensorCoord(WarpShape::kContiguous * tile_offset.contiguous(), 0);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ +
+        ((thread_offset_.contiguous() + iteration_ * kThreadsPerRow * kElementsPerAccess)
+        * sizeof_bits<Element>::value / 8));
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator &operator++() {
+    ++iteration_;
+    if(iteration_ >= kIterations)
+      iteration_ = 0;
+    return *this;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    if(EnableResidualAccess && is_residual_) {
+      is_residual_ = false;
+      thread_offset_ += residual_offset_;
+    }
+    else
+      add_tile_offset(TensorCoord(1, 0));
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator operator++(int) {
+    PredicatedVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return ((thread_offset_.contiguous() +
+              iteration_ * kThreadsPerRow * kElementsPerAccess) < extent_.contiguous());
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization of PredicatedVectorAccessIterator for row-major data.
+///
+template <
+  typename Shape_,
+  typename WarpShape_,
+  typename Element_,
+  int ElementsPerAccess,
+  bool EnableResidualAccess
+>
+class PredicatedVectorAccessIterator<
+  Shape_,
+  WarpShape_,
+  Element_,
+  layout::RowMajor,
+  ElementsPerAccess,
+  EnableResidualAccess
+> {
+ public:
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ConstPointer = const Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+  using UnderlyingIterator = PredicatedVectorAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+      layout::PitchLinearShape<WarpShape::kColumn, WarpShape::kRow>,
+      Element,
+      layout::PitchLinear,
+      ElementsPerAccess,
+      EnableResidualAccess>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+  static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess;
+  static int const kRowsPerIteration = UnderlyingIterator::kRowsPerIteration;
+  static int const kThreads = UnderlyingIterator::kThreads;
+  static int const kIterations = UnderlyingIterator::kIterations;
+ private:
+  //
+  // Data members
+  //
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+      ///< Pointer to the start of the vector
+      ConstPointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< ID of each participating warp
+      int warp_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(pointer, layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id, warp_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+  /// Construct a PredicatedVectorAccessIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator(
+      ConstPointer pointer,   ///< Pointer to the start of the vector
+      TensorCoord extent,     ///< Extent of tensor
+      int thread_id,          ///< ID of each participating thread
+      int warp_id             ///< ID of each participating warp
+      )
+      : PredicatedVectorAccessIterator(pointer, extent, thread_id, warp_id,
+                                        make_Coord(0, 0)) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedVectorAccessIterator operator++(int) {
+    PredicatedVectorAccessIterator self(*this);
+    operator++();
+    return self;
+  }
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    iterator_.advance();
+  }
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h ADDED Viewed

	@@ -0,0 +1,253 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of small
+   scale and bias vectors in the shared memory.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// RegularScaleBiasVectorAccessIterator
+///
+template <typename Shape, typename Element, typename Layout>
+class RegularScaleBiasVectorAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_>
+class RegularScaleBiasVectorAccessIterator<Shape_, Element_, layout::PitchLinear> {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Element type per access
+  static int const kElementsPerAccess = 128 / sizeof_bits<Element>::value;
+  static int const kThreads = Shape::kContiguous / kElementsPerAccess;
+  using AccessType = Array<Element, kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Internal pointer
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator(
+      TensorRef scale_bias_ref,  ///< Pointer to the start of the scale and bias
+                                 ///< vector
+      int thread_id              ///< ID of each participating thread
+      )
+      : byte_offset_(0) {
+    // Per-thread offset in logical coordinates of tensor
+    int thread_offset = thread_id * kElementsPerAccess;
+    // initialize pointer
+    pointer_ =
+        reinterpret_cast<AccessType *>(scale_bias_ref.data() + thread_offset);
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {}
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(pointer_);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator &operator++() { return *this; }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator operator++(int) {
+    RegularScaleBiasVectorAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset in the unit of tile.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    // Multiply by 2 because we store scale and bias belong to the same stage
+    // next to each other.
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous * 2);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for row major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_>
+class RegularScaleBiasVectorAccessIterator<
+    Shape_, Element_,
+    layout::RowMajor> {
+ public:
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularScaleBiasVectorAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator(
+      TensorRef scale_bias_ref,  ///< Pointer to the start of the scale and bias
+                                 ///< vector
+      int thread_id              ///< ID of each participating thread
+      )
+      : iterator_({scale_bias_ref.data(), scale_bias_ref.stride()}, thread_id) {
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularScaleBiasVectorAccessIterator operator++(int) {
+    RegularScaleBiasVectorAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator.h ADDED Viewed

	@@ -0,0 +1,58 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing the address computation of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap,
+          int Alignment =
+              sizeof_bits<Element>::value* ThreadMap::kElementsPerAccess / 8>
+class RegularTileAccessIterator;
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h ADDED Viewed

	@@ -0,0 +1,408 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Element type per access
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset in the unit of tile.
+  /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory.
+  /// Below layouts are the shared memory layouts.  Current SM50 SIMT kernels only use col major A and row major B.
+  ///   For row major A operand, k dimension is contiguous dimension;
+  ///   For col major A operand, k dimension is strided dimension;
+  ///   For row major B operand, k dimension is strided dimension;
+  ///   For col major B operand, k dimension is contiguous dimension.
+  /// Below two classes map col/row major to the pitch linear coordinates used
+  /// in this base class.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           ThreadMap::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for column major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajor,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for row major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::RowMajor,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h ADDED Viewed

	@@ -0,0 +1,587 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+#pragma once
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+template <typename Shape, typename Element, typename Layout, int AdvanceRank,
+          typename ThreadMap,
+           bool Dynamic_iterations = false,
+          int Alignment =
+              sizeof_bits<Element>::value* ThreadMap::kElementsPerAccess / 8
+          >
+class RegularTileAccessIteratorDirectConv;
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps with dynamic_iterations OFF
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_, false, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Element type per access
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    //Do nothing
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset in the unit of tile.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * ThreadMap::Iterations::kStrided *
+                           ThreadMap::Delta::kStrided * stride_ * ThreadMap::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps with dynamic_iterations ON
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::PitchLinear,
+    AdvanceRank, ThreadMap_,true, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Element type per access
+  using AccessType = Array<Element, ThreadMap::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+  /// Total iterattions in the strided dimension: Dynamic value
+  int total_iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) / ThreadMap::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_base));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    total_iteration_strided_ = num;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < total_iteration_strided_) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset in the unit of tile.
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous +
+                       coord.strided() * total_iteration_strided_ * ThreadMap::Delta::kStrided * stride_ *
+                           ThreadMap::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for column major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_,bool Dynamic_iterations, int Alignment >
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::ColumnMajor,
+    AdvanceRank, ThreadMap_, Dynamic_iterations , Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 0 : 1),
+      ThreadMap_,
+      Dynamic_iterations>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    iterator_.set_iteration_num(num);
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for row major layouts
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_,bool Dynamic_iterations, int Alignment>
+class RegularTileAccessIteratorDirectConv<
+    Shape_, Element_,
+    layout::RowMajor,
+    AdvanceRank, ThreadMap_, Dynamic_iterations, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIteratorDirectConv<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear,
+      (kAdvanceRank == 0 ? 1 : 0),
+      ThreadMap_,
+      Dynamic_iterations>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_num(int num) {
+    iterator_.set_iteration_num(num);
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIteratorDirectConv operator++(int) {
+    RegularTileAccessIteratorDirectConv prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h ADDED Viewed

	@@ -0,0 +1,821 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                          Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  static int const kCrosswise = Crosswise;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+    ///< Number of pointers
+    static int const kPointerCount =
+        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_[Detail::kPointerCount];
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base =
+        ThreadMap::initial_offset(thread_id);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+      // This is the offset of a thread within a threadblock tile for a specific
+      // pointer (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+          thread_offset_base +
+          layout::PitchLinearCoord{
+              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+      // initialize pointer
+      pointer_[i] = reinterpret_cast<AccessType *>(
+          ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    }
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
+    int stride_idx = (iteration_strided_ & ~1);
+    int access_offset = stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * Shape::kContiguous * Layout::kFactor +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           Layout::kElementsPerAccess / Layout::kFactor);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous<
+        sizeof_bits<Element_>::value, Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                                  Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for crosswise arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::TensorOpMultiplicandCrosswise<
+                                    sizeof_bits<Element_>::value, Crosswise>,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout =
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  static int const kCrosswise = Crosswise;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  static_assert(!(ThreadMap::Delta::kContiguous % kCrosswise),
+                "kCrosswise is the smallest unit in the contiguous dimension "
+                "for shared memory swizzling.");
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128bs");
+    /// Number of pointers
+    ///
+    /// Note:TN kblock32 layouts only needs 1 pointer, but strangely
+    /// reducing pointer count hurts perfomrnace
+    static int const kPointerCount =
+        (ThreadMap::Iterations::kStrided > 1 ? 2 : 1);
+  };
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Total number of sections.  The memory is divided into stages.  One stage
+  /// can store one tile.  Stage is divided into sections.  Interleaved layout
+  /// can have multiple sections in a stage.  The rest layout only has one section
+  /// in a stage.
+  int sections_;
+  /// Sections that a stage has
+  int sections_per_stage_;
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_[Detail::kPointerCount];
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : sections_(ref.stride(0) / kCrosswise),
+        sections_per_stage_(Shape::kContiguous / kCrosswise),
+        // stride_ = kCrosswise x sections_ x kFactor
+        stride_(ref.stride(0) * Layout::kFactor / Layout::kElementsPerAccess),
+        byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base =
+        ThreadMap::initial_offset(thread_id);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < Detail::kPointerCount; ++i) {
+      // This is the offset of a thread within a threadblock tile for a specific
+      // pointer (units of elements)
+      layout::PitchLinearCoord thread_offset_in_threadblock_tile =
+          thread_offset_base +
+          layout::PitchLinearCoord{
+              0, ThreadMap::Detail::WarpThreadArrangement::kStrided * i};
+      // initialize pointer
+      pointer_[i] = reinterpret_cast<AccessType *>(ref.data()) +
+                    ref.offset(thread_offset_in_threadblock_tile) /
+                        Layout::kElementsPerAccess;
+    }
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_[iteration_strided_ & 1];
+    int stride_idx = (iteration_strided_ & ~1);
+    int access_offset =
+        stride_idx * ThreadMap::Delta::kStrided * stride_ / Layout::kFactor +
+        // kCrosswise elements in the contiguous dimension would span to a
+        // shared memory cache line.
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous / kCrosswise) *
+            Layout::TileShape::kContiguous;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next section.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.contiguous() * sections_per_stage_ * stride_ *
+                           ThreadMap::kElementsPerAccess / sections_ +
+                       coord.strided() * Shape::kStrided * stride_ *
+                           Layout::kElementsPerAccess / Layout::kFactor);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major crosswise TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCrosswise<
+        sizeof_bits<Element_>::value, Crosswise>,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major crosswise TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment, int Crosswise>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCrosswise<
+                                    sizeof_bits<Element_>::value, Crosswise>,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise<
+      sizeof_bits<Element_>::value, Crosswise>;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value,
+                                            Crosswise>,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+////////////////////////////////////////////////////////////////////////////////

build/torch29-cxx11-cu129-aarch64-linux/include/third-party/cutlass/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h ADDED Viewed

	@@ -0,0 +1,1532 @@

+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing computing the addresses of storing of tiles
+   from pitch-linear rank=2 tensors.
+*/
+#pragma once
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/threadblock/regular_tile_access_iterator.h"
+////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous64b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  static_assert(ThreadMap::kThreads / 32 > 1,
+    "This tile iterator requires at least two warps.");
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 64;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 64b");
+    ///< Number of pointers
+    static int const kPointerCount = 1;
+  };
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous +
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous64b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCongruous64b,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous64b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous64b,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for crosswise arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicand64bCrosswise,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  static_assert(ThreadMap::kThreads / 32 > 1,
+    "This tile iterator requires at least two warps.");
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 64;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 64b");
+    ///< Number of pointers - two pointers are needed if making more than 4 iterations along
+    ///< strided dimension
+    static int const kPointerCount = (ThreadMap::Iterations::kStrided > 4 ? 2 : 1);
+  };
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_[Detail::kPointerCount];
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    stride_(ref.stride(0) / ThreadMap::kElementsPerAccess) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data());
+    byte_offset_[0] = ref.offset(thread_offset_in_threadblock_tile) * sizeof(Element);
+    if (Detail::kPointerCount == 2) {
+      byte_offset_[1] = byte_offset_[0] ^ 8;
+    }
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset / ThreadMap::kElementsPerAccess;
+  }
+  /// Returns a pointer
+  CUTLASS_DEVICE
+  AccessType *get() const {
+    // Map the logical contiguous and strided access to the internal swizzled structure.
+    int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16 + stride_ * ThreadMap::Delta::kContiguous * iteration_contiguous_;
+    char *access_byte_ptr = reinterpret_cast<char *>(pointer_ + uniform_offset);
+    int byte_offset;
+    // This iterator may require two byte offsets if it must load more than 8 rows (or 2 iterations)
+    // in the strided dimension
+    if (Detail::kPointerCount == 2 && (iteration_strided_ & 0x4)) {
+      byte_offset = byte_offset_[1];
+    }
+    else {
+      byte_offset = byte_offset_[0];
+    }
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(coord.strided() * Shape::kStrided + coord.contiguous() * Shape::kContiguous * stride_);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major crosswise TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicand64bCrosswise,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major crosswise TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicand64bCrosswise,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicand64bCrosswise,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCongruous128b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  static_assert(ThreadMap::kThreads / 32 > 1,
+    "This tile iterator requires at least two warps.");
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128b");
+    ///< Number of pointers
+    static int const kPointerCount = 1;
+  };
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ +
+                        iteration_contiguous_ * ThreadMap::Delta::kContiguous /
+                            ThreadMap::kElementsPerAccess;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous +
+      coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCongruous128b,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCongruous128b,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCongruous128b;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCongruous128b,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Tile iterator specialized for congruous arrangements for TensorOps
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::TensorOpMultiplicandCrosswise128x4,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  static_assert(ThreadMap::kThreads / 32 > 1,
+    "This tile iterator requires at least two warps.");
+  /// Internal details made public to facilitate introspection
+  struct Detail {
+    /// This iterator is specialized for an access size that is 128 bits in
+    /// length.
+    static int const kAccessSizeInBits = 128;
+    static_assert(sizeof_bits<Element_>::value *
+                          ThreadMap::kElementsPerAccess ==
+                      kAccessSizeInBits,
+                  "This iterator requires a policy whose access size is 128b");
+    ///< Number of pointers
+    static int const kPointerCount = 1;
+  };
+  static_assert(!(ThreadMap::Iterations::kStrided % 2), "This iterator requires at least two iterations along the strided dimension");
+  /// Element type per access
+  using AccessType = Array<Element, Layout::kElementsPerAccess>;
+ private:
+  //
+  // Data members
+  //
+  /// Stride value
+  StrideIndex stride_;
+  /// Internal pointer to first access of tile
+  AccessType *pointer_;
+  /// Internal byte offset
+  Index byte_offset_;
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    stride_(ref.stride(0) / Layout::kElementsPerAccess),
+    byte_offset_(0) {
+    layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id);
+    // This is the offset of a thread within a threadblock tile for a specific
+    // pointer (units of elements)
+    layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base;
+    // initialize pointer
+    pointer_ = reinterpret_cast<AccessType *>(ref.data() + ref.offset(thread_offset_in_threadblock_tile));
+    set_iteration_index(0);
+  }
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+    iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = index / ThreadMap::Iterations::kContiguous;
+  }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_offset_ += pointer_offset * sizeof(Element);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    AccessType *access_ptr = pointer_;
+    int offset_c = (iteration_contiguous_ * ThreadMap::Delta::kContiguous + (iteration_strided_ & 1) * 2);
+    int offset_s = (iteration_strided_ / 2) * 8;
+    int access_offset = offset_c * stride_ + offset_s;
+    char *access_byte_ptr =
+        reinterpret_cast<char *>(access_ptr + access_offset);
+    return reinterpret_cast<AccessType *>(access_byte_ptr + byte_offset_);
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous)
+      return *this;
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    this->operator++();
+    return prev;
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    add_pointer_offset(
+      coord.contiguous() * Shape::kContiguous * stride_ +
+      coord.strided() * Shape::kStrided * Layout::kElementsPerAccess);
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for column-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<
+    Shape_, Element_,
+    layout::ColumnMajorTensorOpMultiplicandCrosswise128x4,
+    AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for column-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(TensorRef ref,  ///< Pointer to start of tensor
+                            int thread_id   ///< ID of each participating thread
+                            )
+      : iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.row(), coord.column()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+/// Tile Iterator specialized for row-major congruous TensorOp formats.
+///
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, int Alignment>
+class RegularTileAccessIterator<Shape_, Element_,
+                                layout::RowMajorTensorOpMultiplicandCrosswise128x4,
+                                AdvanceRank, ThreadMap_, Alignment> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for row-major iterator may along advance along the "
+      "columns(rank=0) or rows(rank=1) dimension.");
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+  static int const kAdvanceRank = AdvanceRank;
+  static int const kAlignment = Alignment;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  /// Underlying iterator type
+  using UnderlyingIterator = RegularTileAccessIterator<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::TensorOpMultiplicandCrosswise128x4,
+      (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
+  using AccessType = typename UnderlyingIterator::AccessType;
+ private:
+  /// Underlying iterator
+  UnderlyingIterator iterator_;
+ public:
+  /// Construct a TileIterator with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator(
+    TensorRef ref,  ///< Pointer to start of tensor
+    int thread_id   ///< ID of each participating thread
+  ):
+    iterator_({ref.data(), ref.stride()}, thread_id) {}
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+  /// Adds a tile offset
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &coord) {
+    iterator_.add_tile_offset({coord.column(), coord.row()});
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator &operator++() {
+    ++iterator_;
+    return *this;
+  }
+  /// Advances to the next tile in memory.
+  CUTLASS_HOST_DEVICE
+  RegularTileAccessIterator operator++(int) {
+    RegularTileAccessIterator prev(*this);
+    ++iterator_;
+    return prev;
+  }
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+/////////////////////////////////////////////////////////////////////////////////////////////////