Block-Structured AMR Software Framework
AMReX_GpuReduce.H
#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_Functional.H>
#include <AMReX_TypeTraits.H>

#if defined(AMREX_USE_CUDA)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && defined(CCCL_MINOR_VERSION) && ((CCCL_MAJOR_VERSION >= 3) || ((CCCL_MAJOR_VERSION >= 2) && (CCCL_MINOR_VERSION >= 8)))
#define AMREX_CUDA_CCCL_VER_GE_2_8 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif

//
// Public interface
//
namespace amrex::Gpu {
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
}

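// Illustrative usage sketch (not part of this header): the public interface
// above is intended to be called from inside a kernel that receives a
// Gpu::Handler, e.g. a ParallelFor launched with
// Gpu::KernelInfo().setReduction(true). In this sketch, `bx` (a Box) and
// `fab` (an Array4<Real>) are assumed to be provided by the caller.
//
//     Gpu::Buffer<Real> dsum({Real(0.0)});
//     Real* dp = dsum.data();
//     amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
//     {
//         Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
//     });
//     Real hsum = *dsum.copyToHost();
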
//
// Reduce functions based on __shfl_down_sync
//
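// With warpSize = 32 the shuffle loops below use offsets 16, 8, 4, 2, 1:
// lane i combines its value with lane i+16, then lane i+8, and so on, so
// after log2(32) = 5 steps lane 0 holds the reduction over the whole warp.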

namespace amrex::Gpu {

#ifdef AMREX_USE_SYCL

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceSum(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMin(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMax(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}

#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

namespace detail {

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}

// If other sizeof is needed, we can implement it later.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i],offset);
    }
    return y;
}
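// multi_shuffle_down above moves a large type word by word: e.g., a
// 16-byte pair of doubles travels as four 4-byte unsigned int shuffles.
// This is why sizeof(T) must be a multiple of sizeof(unsigned int).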

}

template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
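// Note on the scheme above: each warp first reduces its own values via
// shuffles, and lane 0 stores the warp's partial result in shared[wid].
// Warp 0 then reloads those partials (one per lane) and reduces them,
// with the identity value x0 padding lanes beyond the number of warps,
// so thread 0 of the block ends up with the block-wide result.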

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // full warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest,x);
    }
}
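// In the partial-block case above, a warp fully covered by
// numActiveThreads still cooperates via shuffles and issues a single
// atomic from lane 0; threads in a warp that is only partially covered
// skip the shuffle (its upper lanes hold no valid data) and each issue
// their own atomic instead.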

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUDA)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}
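// The BLOCKDIMX overloads exist because cub::BlockReduce and
// rocprim::block_reduce take the block size as a compile-time template
// parameter; deviceReduceSum and friends below dispatch to them for the
// common 128- and 256-thread launches and otherwise fall back to the
// generic shuffle-based blockReduce.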

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUDA)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();

#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
    return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUDA)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
    return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUDA)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUDA)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
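// The same full/partial dispatch is repeated below for Min, Max,
// LogicalAnd, and LogicalOr. In every case each block combines its
// result into *dest atomically, so the caller is expected to initialize
// *dest to the operation's identity (e.g., 0 for sums, 1 for
// logical-and) before launching the kernel.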

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}

#else

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp atomic
#endif
    *dest += source;
}
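// Host fallback note: the sum can use "#pragma omp atomic", which
// supports the "x += expr" update form. OpenMP atomics do not cover the
// min/max and logical assignments below, so those use named critical
// sections instead.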

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceSum_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMin_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMax_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}

#endif
}

#endif