1#ifndef AMREX_GPU_REDUCE_H_
2#define AMREX_GPU_REDUCE_H_
3#include <AMReX_Config.H>
13#if defined(AMREX_USE_CUDA)
15#if defined(CCCL_MAJOR_VERSION) && defined(CCCL_MINOR_VERSION) && ((CCCL_MAJOR_VERSION >= 3) || ((CCCL_MAJOR_VERSION >= 2) && (CCCL_MINOR_VERSION >= 8)))
16#define AMREX_CUDA_CCCL_VER_GE_2_8 1
18#elif defined(AMREX_USE_HIP)
19#include <rocprim/rocprim.hpp>
28 void deviceReduceSum (T * dest, T source, Gpu::Handler
const& h)
noexcept;
32 void deviceReduceMin (T * dest, T source, Gpu::Handler
const& h)
noexcept;
36 void deviceReduceMax (T * dest, T source, Gpu::Handler
const& h)
noexcept;
53template <
int warpSize,
typename T,
typename F>
57 T
operator() (T
x, sycl::sub_group
const& sg)
const noexcept
60 T
y = sycl::shift_group_left(sg,
x,
offset);
67template <
int warpSize,
typename T,
typename WARPREDUCE>
69T
blockReduce (T
x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler
const& h)
71 T* shared = (T*)h.local;
72 int tid = h.item->get_local_id(0);
73 sycl::sub_group
const& sg = h.item->get_sub_group();
74 int lane = sg.get_local_id()[0];
75 int wid = sg.get_group_id()[0];
76 int numwarps = sg.get_group_range()[0];
77 x = warp_reduce(
x, sg);
82 h.item->barrier(sycl::access::fence_space::local_space);
83 if (lane == 0) { shared[wid] =
x; }
84 h.item->barrier(sycl::access::fence_space::local_space);
85 bool b = (tid == 0) || (tid < numwarps);
86 x = b ? shared[lane] : x0;
87 if (wid == 0) {
x = warp_reduce(
x, sg); }
91template <
int warpSize,
typename T,
typename WARPREDUCE,
typename ATOMICOP>
94 Gpu::Handler
const& handler)
96 sycl::sub_group
const& sg = handler.item->get_sub_group();
97 int wid = sg.get_group_id()[0];
98 if ((wid+1)*warpSize <= handler.numActiveThreads) {
99 x = warp_reduce(
x, sg);
100 if (sg.get_local_id()[0] == 0) { atomic_op(dest,
x); }
111 return Gpu::blockReduce<Gpu::Device::warp_size>
128 return Gpu::blockReduce<Gpu::Device::warp_size>
145 return Gpu::blockReduce<Gpu::Device::warp_size>
161 return Gpu::blockReduce<Gpu::Device::warp_size>
176 return Gpu::blockReduce<Gpu::Device::warp_size>
189void deviceReduceSum (T * dest, T source, Gpu::Handler
const& h)
noexcept
191 if (h.isFullBlock()) {
194 Gpu::blockReduce_partial<Gpu::Device::warp_size>
196 Gpu::AtomicAdd<T>(), h);
202void deviceReduceMin (T * dest, T source, Gpu::Handler
const& h)
noexcept
204 if (h.isFullBlock()) {
207 Gpu::blockReduce_partial<Gpu::Device::warp_size>
209 Gpu::AtomicMin<T>(), h);
215void deviceReduceMax (T * dest, T source, Gpu::Handler
const& h)
noexcept
217 if (h.isFullBlock()) {
220 Gpu::blockReduce_partial<Gpu::Device::warp_size>
222 Gpu::AtomicMax<T>(), h);
229 if (h.isFullBlock()) {
232 Gpu::blockReduce_partial<Gpu::Device::warp_size>
234 Gpu::AtomicLogicalAnd<int>(), h);
241 if (h.isFullBlock()) {
244 Gpu::blockReduce_partial<Gpu::Device::warp_size>
246 Gpu::AtomicLogicalOr<int>(), h);
250#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
257T shuffle_down (T x,
int offset)
noexcept
260 __shfl_down_sync(0xffffffff, x,
offset));
264template <
class T, std::enable_if_t<sizeof(T)%sizeof(
unsigned int) == 0,
int> = 0>
266T multi_shuffle_down (T x,
int offset)
noexcept
268 constexpr int nwords = (
sizeof(T) +
sizeof(
unsigned int) - 1) /
sizeof(
unsigned int);
270 auto py =
reinterpret_cast<unsigned int*
>(&
y);
271 auto px =
reinterpret_cast<unsigned int*
>(&
x);
272 for (
int i = 0; i < nwords; ++i) {
273 py[i] = shuffle_down(px[i],
offset);
281template <
int warpSize,
typename T,
typename F>
285 template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,
int> = 0>
290 T
y = detail::shuffle_down(
x,
offset);
296 template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,
int> = 0>
301 T
y = detail::multi_shuffle_down(
x,
offset);
308template <
int warpSize,
typename T,
typename WARPREDUCE>
312 __shared__ T shared[warpSize];
313 int lane = threadIdx.x % warpSize;
314 int wid = threadIdx.x / warpSize;
321 if (lane == 0) { shared[wid] =
x; }
323 bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
324 x = b ? shared[lane] : x0;
325 if (wid == 0) {
x = warp_reduce(
x); }
329template <
int warpSize,
typename T,
typename WARPREDUCE,
typename ATOMICOP>
334 int warp = (
int)threadIdx.x / warpSize;
337 if (threadIdx.x % warpSize == 0) { atomic_op(dest,
x); }
348 return Gpu::blockReduce<Gpu::Device::warp_size>
361template <
int BLOCKDIMX,
typename T>
365#if defined(AMREX_USE_CUDA)
366 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
367 __shared__
typename BlockReduce::TempStorage temp_storage;
373 return BlockReduce(temp_storage).Sum(source);
374#elif defined(AMREX_USE_HIP)
375 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
376 __shared__
typename BlockReduce::storage_type temp_storage;
378 BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
385template <
int BLOCKDIMX,
typename T>
389 T aggregate = blockReduceSum<BLOCKDIMX>(source);
398 return Gpu::blockReduce<Gpu::Device::warp_size>
411template <
int BLOCKDIMX,
typename T>
415#if defined(AMREX_USE_CUDA)
416 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
417 __shared__
typename BlockReduce::TempStorage temp_storage;
424#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
425 return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
427 return BlockReduce(temp_storage).Reduce(source, cub::Min());
429#elif defined(AMREX_USE_HIP)
430 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
431 __shared__
typename BlockReduce::storage_type temp_storage;
433 BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
440template <
int BLOCKDIMX,
typename T>
444 T aggregate = blockReduceMin<BLOCKDIMX>(source);
453 return Gpu::blockReduce<Gpu::Device::warp_size>
466template <
int BLOCKDIMX,
typename T>
470#if defined(AMREX_USE_CUDA)
471 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
472 __shared__
typename BlockReduce::TempStorage temp_storage;
478#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
479 return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
481 return BlockReduce(temp_storage).Reduce(source, cub::Max());
483#elif defined(AMREX_USE_HIP)
484 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
485 __shared__
typename BlockReduce::storage_type temp_storage;
487 BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
494template <
int BLOCKDIMX,
typename T>
498 T aggregate = blockReduceMax<BLOCKDIMX>(source);
506 return Gpu::blockReduce<Gpu::Device::warp_size>
518template <
int BLOCKDIMX>
522#if defined(AMREX_USE_CUDA)
523 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
524 __shared__
typename BlockReduce::TempStorage temp_storage;
531#elif defined(AMREX_USE_HIP)
532 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
533 __shared__
typename BlockReduce::storage_type temp_storage;
542template <
int BLOCKDIMX>
546 int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
554 return Gpu::blockReduce<Gpu::Device::warp_size>
566template <
int BLOCKDIMX>
570#if defined(AMREX_USE_CUDA)
571 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
572 __shared__
typename BlockReduce::TempStorage temp_storage;
579#elif defined(AMREX_USE_HIP)
580 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
581 __shared__
typename BlockReduce::storage_type temp_storage;
590template <
int BLOCKDIMX>
594 int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
602 if (handler.isFullBlock()) {
603 if (blockDim.x == 128) {
604 deviceReduceSum_full<128,T>(dest, source);
605 }
else if (blockDim.x == 256) {
606 deviceReduceSum_full<256,T>(dest, source);
608 deviceReduceSum_full<T>(dest, source);
611 Gpu::blockReduce_partial<Gpu::Device::warp_size>
621 if (handler.isFullBlock()) {
622 if (blockDim.x == 128) {
623 deviceReduceMin_full<128,T>(dest, source);
624 }
else if (blockDim.x == 256) {
625 deviceReduceMin_full<256,T>(dest, source);
627 deviceReduceMin_full<T>(dest, source);
630 Gpu::blockReduce_partial<Gpu::Device::warp_size>
640 if (handler.isFullBlock()) {
641 if (blockDim.x == 128) {
642 deviceReduceMax_full<128,T>(dest, source);
643 }
else if (blockDim.x == 256) {
644 deviceReduceMax_full<256,T>(dest, source);
646 deviceReduceMax_full<T>(dest, source);
649 Gpu::blockReduce_partial<Gpu::Device::warp_size>
658 if (handler.isFullBlock()) {
659 if (blockDim.x == 128) {
660 deviceReduceLogicalAnd_full<128>(dest, source);
661 }
else if (blockDim.x == 256) {
662 deviceReduceLogicalAnd_full<256>(dest, source);
667 Gpu::blockReduce_partial<Gpu::Device::warp_size>
676 if (handler.isFullBlock()) {
677 if (blockDim.x == 128) {
678 deviceReduceLogicalOr_full<128>(dest, source);
679 }
else if (blockDim.x == 256) {
680 deviceReduceLogicalOr_full<256>(dest, source);
685 Gpu::blockReduce_partial<Gpu::Device::warp_size>
715#pragma omp critical (gpureduce_reducemin)
717 *dest = std::min(*dest, source);
732#pragma omp critical (gpureduce_reducemax)
734 *dest = std::max(*dest, source);
748#pragma omp critical (gpureduce_reduceand)
750 *dest = (*dest) && source;
763#pragma omp critical (gpureduce_reduceor)
765 *dest = (*dest) || source;
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:17
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
__host__ __device__ AMREX_FORCE_INLINE int LogicalAnd(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:461
__host__ __device__ AMREX_FORCE_INLINE int LogicalOr(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:436
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356
Definition AMReX_BaseFwd.H:55
__device__ void deviceReduceSum(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:600
__device__ T blockReduce(T x, WARPREDUCE &&warp_reduce, T x0)
Definition AMReX_GpuReduce.H:310
__device__ void deviceReduceLogicalAnd_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:511
__device__ void blockReduce_partial(T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler)
Definition AMReX_GpuReduce.H:331
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:552
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:451
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:396
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:504
__device__ void deviceReduceMax(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:638
__device__ void deviceReduceMax_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:459
__device__ void deviceReduceLogicalAnd(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:656
__device__ void deviceReduceLogicalOr(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:674
__device__ void deviceReduceSum_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:354
__device__ void deviceReduceMin_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:404
__device__ void deviceReduceMin(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:619
__device__ void deviceReduceLogicalOr_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:559
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:346
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_GpuAtomic.H:657
Definition AMReX_GpuAtomic.H:681
Definition AMReX_GpuAtomic.H:689
Definition AMReX_GpuAtomic.H:673
Definition AMReX_GpuAtomic.H:665
Definition AMReX_GpuTypes.H:86
int numActiveThreads
Definition AMReX_GpuTypes.H:96
Definition AMReX_GpuReduce.H:283
__device__ T operator()(T x) const noexcept
Definition AMReX_GpuReduce.H:287
Definition AMReX_Functional.H:50
Definition AMReX_Functional.H:59
Definition AMReX_Functional.H:41
Definition AMReX_Functional.H:32
Definition AMReX_Functional.H:14