//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// MIT License
//
// Modifications Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef _CUDA_BARRIER
#define _CUDA_BARRIER

// NOTE(HIP/AMD): barrier is not supported on AMD hardware
#ifdef __HIP_PLATFORM_AMD__
#error barrier is not supported on AMD hardware and should not be included
#endif

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/ptx>
#include <cuda/std/barrier>

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
// know its size. This type is defined in cuda.h (driver API) as:
//
//     typedef struct CUtensorMap_st {  [ .. snip .. ] } CUtensorMap;
//
// We need to forward-declare both CUtensorMap_st (the struct) and CUtensorMap
// (the typedef):
struct CUtensorMap_st;
typedef struct CUtensorMap_st CUtensorMap;

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

// Experimental exposure of TMA PTX:
//
// - cp_async_bulk_global_to_shared
// - cp_async_bulk_shared_to_global
// - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared
// - cp_async_bulk_tensor_{1,2,3,4,5}d_shared_to_global
// - fence_proxy_async_shared_cta
// - cp_async_bulk_commit_group
// - cp_async_bulk_wait_group_read<0, …, 7>

// These PTX wrappers are only available when the code is compiled compute
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
  void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
  _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
  _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");

  _CUDA_VPTX::cp_async_bulk(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __src,
    __size,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size)
{
  _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
  _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
  _LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");

  _CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
  void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0};

  _CUDA_VPTX::cp_async_bulk_tensor(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
  void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

  _CUDA_VPTX::cp_async_bulk_tensor(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

  _CUDA_VPTX::cp_async_bulk_tensor(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  int __c3,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

  _CUDA_VPTX::cp_async_bulk_tensor(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  int __c3,
  int __c4,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

  _CUDA_VPTX::cp_async_bulk_tensor(
    _CUDA_VPTX::space_cluster,
    _CUDA_VPTX::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0};

  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
{
  const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

  _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
{
  _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
inline _CCCL_DEVICE void cp_async_bulk_commit_group()
{
  _CUDA_VPTX::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int __n_prior>
inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
{
  static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
  _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

#endif // _CUDA_BARRIER
