// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp"

#include "ck/library/tensor_operation_instance/add_grouped_conv_bwd_wei_exp_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using namespace ck::tensor_layout::convolution;

using BF16 = bhalf_t;
using F16  = half_t;
using F32  = float;

using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor;

template <index_t... Is>
using S = Sequence<Is...>;

using PassThrough = element_wise::PassThrough;

static constexpr auto GemmDefault    = GemmSpecialization::Default;
static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
static constexpr auto GemmMKPadding  = GemmSpecialization::MKPadding;
static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;

static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;

// Instance list: DeviceBatchedGemmMultiD_Wmma_CShuffleV3 kernels built with the GemmDefault
// specialization.
//
// Arguments shared by every entry (see the column header inside the tuple):
//   - A is Col, B is Row, E is Row; the Ds layout and Ds data type are empty (Tuple<>).
//   - InOutDataType fills the A, B, C and CShuffle data-type slots; the accumulator type is F32.
//   - The A, B and CDE element-wise operations are all PassThrough.
//   - AK1 = BK1 = 8, MPerWmma = NPerWmma = 16.
//   - Scheduler is BlockGemmPipelineScheduler::Intrawave, pipeline version is v1.
// Entries differ in block size, M/N/K per block, M/N repeat, the A/B block-transfer thread
// clusters and source vector widths, the A/B LDS "AddExtra" flags and the CShuffle cluster
// lengths.
//
// Template parameter:
//   InOutDataType - element type substituted into the A/B/C/CShuffle slots of each instance.
template <typename InOutDataType>
using device_gemm_wmma_universal_km_kn_mn_GemmDefault_instances = std::tuple<
    // clang-format off
    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|                          BlockwiseGemm|                BlockwiseGemm|
    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|                               Pipeline|                     Pipeline|
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |                              Scheduler|                      Version|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                                       |                             |
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    48,    96,    64,   8,   8,   16,   16,       3,       3,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         1,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    32,    64,   128,   8,   8,   16,   16,       2,       1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,                      S<1, 16, 1, 8>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    32,   8,   8,   16,   16,       4,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,   192,    48,    96,   192,   8,   8,   16,   16,       3,       1,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                     S<1, 16, 1, 12>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    48,    64,    64,   8,   8,   16,   16,       3,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,   128,    96,   128,    64,   8,   8,   16,   16,       6,       2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         0,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 8>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,   192,    32,    96,   192,   8,   8,   16,   16,       2,       1,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         1,           1,           1,                     S<1, 16, 1, 12>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    32,    96,    64,   8,   8,   16,   16,       2,       3,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,   192,    32,    96,   192,   8,   8,   16,   16,       2,       1,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<24, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                     S<1, 16, 1, 12>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    64,    64,    32,   8,   8,   16,   16,       4,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>, InOutDataType,           F32, InOutDataType, PassThrough, PassThrough, PassThrough,    GemmDefault,    64,    32,    32,    64,   8,   8,   16,   16,       2,       1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
    // clang-format on
    >;

// Instance list: DeviceBatchedGemmMultiD_Wmma_CShuffleV3 kernels for F16 data built with the
// GemmMNKPadding specialization.
//
// Arguments shared by every entry (see the column header inside the tuple):
//   - A is Col, B is Row, E is Row; the Ds layout and Ds data type are empty (Tuple<>).
//   - A, B, C and CShuffle data types are F16; the accumulator type is F32.
//   - The A, B and CDE element-wise operations are all PassThrough.
//   - AK1 = BK1 = 8, MPerWmma = NPerWmma = 16.
//   - Scheduler is BlockGemmPipelineScheduler::Intrawave, pipeline version is v1.
// Entries differ in block size, M/N/K per block, M/N repeat, the A/B block-transfer thread
// clusters and source vector widths, and the A/B LDS "AddExtra" flags.
//
// One configuration (block size 96) is intentionally left commented out; its trailing note
// records that it produced incorrect results for f16.
using device_gemm_wmma_universal_km_kn_mn_GemmMNKPadding_f16_instances = std::tuple<
    // clang-format off
    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|                          BlockwiseGemm|                BlockwiseGemm|
    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|                               Pipeline|                     Pipeline|
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |                              Scheduler|                      Version|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                                       |                             |
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         1,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    64,    64,   8,   8,   16,   16,       4,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   128,    48,    64,   128,   8,   8,   16,   16,       3,       1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 8>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    64,    64,   8,   8,   16,   16,       4,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    48,    64,    64,   8,   8,   16,   16,       3,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    96,    64,    32,   8,   8,   16,   16,       6,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    48,    32,   128,   8,   8,   16,   16,       3,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    96,    64,   8,   8,   16,   16,       2,       3,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    // DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    96,    64,    96,    48,   8,   8,   16,   16,       4,       2,     S<6, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<6, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 6>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>, // Incorrect results for f16
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,           F16,           F16,    Tuple<>,           F16,           F32,           F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
    // clang-format on
    >;

// BF16 instance list: a std::tuple of DeviceBatchedGemmMultiD_Wmma_CShuffleV3 instantiations.
// Every row shares: A layout Col, B layout Row, E layout Row, no D tensors (Tuple<>),
// BF16 A/B/output/CShuffle data types with F32 accumulation, PassThrough element-wise ops,
// GemmMNKPadding specialization, the Intrawave scheduler and pipeline version v1.
// Rows differ only in block size, per-block tile sizes, repeat counts, thread-cluster shapes,
// source scalar-per-vector widths and the LDS extra-padding flags (see the column headers below).
// Unlike the other lists in this file this alias is not a template: the data type is fixed to BF16.
using device_gemm_wmma_universal_km_kn_mn_GemmMNKPadding_bf16_instances = std::tuple<
    // clang-format off
    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|                          BlockwiseGemm|                BlockwiseGemm|
    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|                               Pipeline|                     Pipeline|
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |                              Scheduler|                      Version|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                                       |                             |
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         1,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    64,    64,   8,   8,   16,   16,       4,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   128,    48,    64,   128,   8,   8,   16,   16,       3,       1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 8>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    64,    64,   8,   8,   16,   16,       4,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    64,    96,    32,   8,   8,   16,   16,       4,       3,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    48,    64,    64,   8,   8,   16,   16,       3,       2,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    96,    64,    32,   8,   8,   16,   16,       6,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    48,    32,   128,   8,   8,   16,   16,       3,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              6,              8,         1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    96,    64,   8,   8,   16,   16,       2,       3,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,      S<8, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    96,    64,    96,    48,   8,   8,   16,   16,       4,       2,     S<6, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<6, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              6,              8,         0,           1,           1,                      S<1, 16, 1, 6>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>, // NOTE: the same tile configuration is commented out of the F16 list ("Incorrect results for f16"); it is enabled here for BF16
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row,          BF16,          BF16,    Tuple<>,          BF16,           F32,          BF16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,    64,    32,    32,   128,   8,   8,   16,   16,       2,       1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         0,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         0,           1,           1,                      S<1, 16, 1, 4>,             S<8, 8, 8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>
    // clang-format on
    >;

// Instance list parameterized on the input data type and the block-GEMM pipeline scheduler:
// a std::tuple of DeviceBatchedGemmMultiD_Wmma_CShuffleV3 instantiations.
//
// Template parameters:
//   InOutDataType    - used for the A and B data types and for the CShuffle data type.
//   BlkGemmPipeSched - forwarded as the BlockwiseGemm pipeline scheduler of every row.
//
// Every row shares: A layout Col, B layout Row, E layout Row, no D tensors (Tuple<>),
// the output type (column labelled CDataType) and the accumulator type fixed to F32,
// GemmMNKPadding specialization, block size 256 and pipeline version v1.
// Every row also uses a source scalar-per-vector of 1 for both the A and B block transfers and
// S<1, 1, 1> for the CShuffle block-transfer scalars-per-vector.
// NOTE(review): presumably the width-1 accesses are what keep these instances usable when M/N
// are odd, as the alias name suggests - confirm against the device op's argument checks.
template <typename InOutDataType, BlockGemmPipelineScheduler BlkGemmPipeSched>
using device_gemm_wmma_universal_km_kn_mn_irregular_odd_mn_instances = std::tuple<
    // clang-format off
    //#####################################| ALayout| BLayout| DsLayout |ELayout|     ADataType|     BDataType| DsDataType|     CDataType|   AccDataType|      CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CShuffleBlockTransferClusterLengths|  CShuffleBlockTransfer|     BlockwiseGemm|                BlockwiseGemm|
    //#####################################|        |        |          |       |              |              |           |              |              |      DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|     MRepeat|     NRepeat|  _MBlock_MPerBlock_NBlock_NPerBlock|       ScalarPerVectors|          Pipeline|                     Pipeline|
    //#####################################|        |        |          |       |              |              |           |              |              |              |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                                    |                       |         Scheduler|                      Version|
    //#####################################|        |        |          |       |              |              |           |              |              |              |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                    |                       |                  |                             |
    // Latency friendly
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    32,    64,   8,   8,   16,   16,       1,       2,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         0,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    32,    64,   8,   8,   16,   16,       1,       2,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    32,   128,   8,   8,   16,   16,       1,       2,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    48,   128,   8,   8,   16,   16,       1,       3,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    64,    32,   8,   8,   16,   16,       1,       4,    S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    64,    32,   8,   8,   16,   16,       1,       4,    S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,    S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    64,    64,   8,   8,   16,   16,       1,       4,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    64,   128,   8,   8,   16,   16,       1,       4,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    96,    64,   8,   8,   16,   16,       1,       6,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,    96,   128,   8,   8,   16,   16,       1,       6,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   128,   192,    32,   8,   8,   16,   16,       1,       12,   S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<4,  64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
    DeviceBatchedGemmMultiD_Wmma_CShuffleV3<     Col,     Row,   Tuple<>,    Row, InOutDataType, InOutDataType,    Tuple<>,           F32,           F32, InOutDataType, PassThrough, PassThrough, PassThrough, GemmMNKPadding,   256,   256,    96,    64,   8,   8,   16,   16,       2,       6,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,    S<8,  32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,                     S<1, 16, 1, 16>,             S<1, 1, 1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>
    // Memory friendly
    // TODO: add once v2 is implemented
    // clang-format on
    >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
