Block-Structured AMR Software Framework
AMReX_Reduce.H
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_TypeList.H>
10#include <AMReX_ValLocPair.H>
11
12#include <algorithm>
13#include <functional>
14#include <limits>
15
16namespace amrex {
17
18namespace Reduce {
19
20// The declarations of these functions are here to work around doxygen issues.
21
36template <typename T, typename N,
37 std::enable_if_t<std::is_integral_v<N>,int> = 0>
38T Sum (N n, T const* v, T init_val = 0);
39
56template <typename T, typename N, typename F,
57 std::enable_if_t<std::is_integral_v<N> &&
58 !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
59T Sum (N n, F const& f, T init_val = 0);
60
75template <typename T, typename N,
76 std::enable_if_t<std::is_integral_v<N>,int> = 0>
77T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max());
78
95template <typename T, typename N, typename F,
96 std::enable_if_t<std::is_integral_v<N> &&
97 !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
98T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max());
99
114template <typename T, typename N,
115 std::enable_if_t<std::is_integral_v<N>,int> = 0>
116T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest());
117
134template <typename T, typename N, typename F,
135 std::enable_if_t<std::is_integral_v<N> &&
136 !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
137T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest());
138
152template <typename T, typename N,
153 std::enable_if_t<std::is_integral_v<N>,int> = 0>
154std::pair<T,T> MinMax (N n, T const* v);
155
171template <typename T, typename N, typename F,
172 std::enable_if_t<std::is_integral_v<N> &&
173 !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
174std::pair<T,T> MinMax (N n, F const& f);
175
191template <typename T, typename N, typename P,
192 std::enable_if_t<std::is_integral_v<N>,int> = 0>
193bool AnyOf (N n, T const* v, P const& pred);
194
208template <typename P, int dim>
209bool AnyOf (BoxND<dim> const& box, P const& pred);
210
211}
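// Editor's sketch (not part of the upstream header): typical calls to the free
// functions declared above. Names are assumptions: `n` is an element count and
// `d_ptr` points to `n` amrex::Real values in device-accessible memory.
//
//     amrex::Real total = amrex::Reduce::Sum(n, d_ptr);      // pointer overload
//     amrex::Real vmin  = amrex::Reduce::Min(n, d_ptr);
//     amrex::Real vmax  = amrex::Reduce::Max<amrex::Real>(n, // callable overload
//         [=] AMREX_GPU_DEVICE (int i) { return 2.0*d_ptr[i]; });
//     bool has_neg = amrex::Reduce::AnyOf(n, d_ptr,
//         [=] AMREX_GPU_DEVICE (amrex::Real x) { return x < 0.0; });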
212
214namespace Reduce::detail {
215
216#ifdef AMREX_USE_GPU
217#ifdef AMREX_USE_SYCL
218 template <std::size_t I, typename T, typename P>
220 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
221 {
222 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
223 }
224
225 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
227 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
228 {
229 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
230 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
231 }
232#else
233 template <std::size_t I, typename T, typename P>
235 void for_each_parallel (T& d, T const& s)
236 {
237 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
238 }
239
240 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
242 void for_each_parallel (T& d, T const& s)
243 {
244 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
245 for_each_parallel<I+1,T,P1,Ps...>(d, s);
246 }
247#endif
248#endif
249
250 template <std::size_t I, typename T, typename P>
252 void for_each_local (T& d, T const& s)
253 {
254 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
255 }
256
257 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
259 void for_each_local (T& d, T const& s)
260 {
261 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
262 for_each_local<I+1,T,P1,Ps...>(d, s);
263 }
264
265 template <std::size_t I, typename T, typename P>
267 constexpr void for_each_init (T& t)
268 {
269 P().init(amrex::get<I>(t));
270 }
271
272 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
274 constexpr void for_each_init (T& t)
275 {
276 P().init(amrex::get<I>(t));
277 for_each_init<I+1,T,P1,Ps...>(t);
278 }
279}
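// Editor's note (not part of the upstream header): each reduce-op struct below
// provides three hooks that the helpers above dispatch per tuple element:
// init() writes the operation's identity value, local_update() folds one value
// into a thread-local partial result, and parallel_update() (GPU builds only)
// combines partial results across a thread block. A minimal host-side sketch,
// assuming `using namespace amrex`:
//
//     GpuTuple<Real,int> t;
//     Reduce::detail::for_each_init<0, decltype(t), ReduceOpSum, ReduceOpMax>(t);
//     // t now holds {0, std::numeric_limits<int>::lowest()}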
281
283struct ReduceOpSum
284{
285
286#ifdef AMREX_USE_GPU
287#ifdef AMREX_USE_SYCL
288 template <typename T>
290 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
291 T r = Gpu::blockReduceSum(s,h);
292 if (h.threadIdx() == 0) { d += r; }
293 }
294#else
295 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
297 void parallel_update (T& d, T const& s) const noexcept {
298 T r = Gpu::blockReduceSum<MT>(s);
299 if (threadIdx.x == 0) { d += r; }
300 }
301#endif
302#endif
303
304 template <typename T>
306 void local_update (T& d, T const& s) const noexcept { d += s; }
307
308 template <typename T>
309 constexpr void init (T& t) const noexcept { t = 0; }
310};
311
313struct ReduceOpMin
314{
315#ifdef AMREX_USE_GPU
316#ifdef AMREX_USE_SYCL
317 template <typename T>
319 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
320 T r = Gpu::blockReduceMin(s,h);
321 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
322 }
323#else
324 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
326 void parallel_update (T& d, T const& s) const noexcept {
327 T r = Gpu::blockReduceMin<MT>(s);
328 if (threadIdx.x == 0) { d = amrex::min(d,r); }
329 }
330#endif
331#endif
332
333 template <typename T>
335 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
336
337 template <typename T>
338 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
339 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
340
341 template <typename T>
342 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
343 init (T& t) const noexcept { t = T::max(); }
344};
345
347struct ReduceOpMax
348{
349#ifdef AMREX_USE_GPU
350#ifdef AMREX_USE_SYCL
351 template <typename T>
353 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
354 T r = Gpu::blockReduceMax(s,h);
355 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
356 }
357#else
358 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
360 void parallel_update (T& d, T const& s) const noexcept {
361 T r = Gpu::blockReduceMax<MT>(s);
362 if (threadIdx.x == 0) { d = amrex::max(d,r); }
363 }
364#endif
365#endif
366
367 template <typename T>
369 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
370
371 template <typename T>
372 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
373 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
374
375 template <typename T>
376 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
377 init (T& t) const noexcept { t = T::lowest(); }
378};
379
381struct ReduceOpLogicalAnd
382{
383#ifdef AMREX_USE_GPU
384#ifdef AMREX_USE_SYCL
385 template <typename T>
387 std::enable_if_t<std::is_integral_v<T>>
388 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
389 T r = Gpu::blockReduceLogicalAnd(s,h);
390 if (h.threadIdx() == 0) { d = d && r; }
391 }
392#else
393 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
395 std::enable_if_t<std::is_integral_v<T>>
396 parallel_update (T& d, T s) const noexcept {
397 T r = Gpu::blockReduceLogicalAnd<MT>(s);
398 if (threadIdx.x == 0) { d = d && r; }
399 }
400#endif
401#endif
402
403 template <typename T>
405 std::enable_if_t<std::is_integral_v<T>>
406 local_update (T& d, T s) const noexcept { d = d && s; }
407
408 template <typename T>
409 constexpr std::enable_if_t<std::is_integral_v<T>>
410 init (T& t) const noexcept { t = true; }
411};
412
414struct ReduceOpLogicalOr
415{
416#ifdef AMREX_USE_GPU
417#ifdef AMREX_USE_SYCL
418 template <typename T>
420 std::enable_if_t<std::is_integral_v<T>>
421 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
422 T r = Gpu::blockReduceLogicalOr(s,h);
423 if (h.threadIdx() == 0) { d = d || r; }
424 }
425#else
426 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
428 std::enable_if_t<std::is_integral_v<T>>
429 parallel_update (T& d, T s) const noexcept {
430 T r = Gpu::blockReduceLogicalOr<MT>(s);
431 if (threadIdx.x == 0) { d = d || r; }
432 }
433#endif
434#endif
435
436 template <typename T>
438 std::enable_if_t<std::is_integral_v<T>>
439 local_update (T& d, T s) const noexcept { d = d || s; }
440
441 template <typename T>
442 constexpr std::enable_if_t<std::is_integral_v<T>>
443 init (T& t) const noexcept { t = false; }
444};
445
446template <typename... Ps> class ReduceOps;
447
448#ifdef AMREX_USE_GPU
449
451template <typename... Ts>
452class ReduceData
453{
454public:
455 using Type = GpuTuple<Ts...>;
456
457 template <typename... Ps>
458 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
459 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
460 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
461 m_device_tuple((Type*)(The_Arena()->alloc((AMREX_GPU_MAX_STREAMS)
462 * m_max_blocks * sizeof(Type)))),
463 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
464 {
465 reduce_op.resetResultReadiness();
466 static_assert(std::is_trivially_copyable<Type>(),
467 "ReduceData::Type must be trivially copyable");
468 static_assert(std::is_trivially_destructible<Type>(),
469 "ReduceData::Type must be trivially destructible");
470
471 new (m_host_tuple) Type();
472 m_nblocks.fill(0);
473 }
474
475 ~ReduceData () {
476 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
477 !m_used_external_stream || m_value_called,
478 "ReduceData used on an external GPU stream must call value() before destruction.");
479 The_Pinned_Arena()->free(m_host_tuple);
480 The_Arena()->free(m_device_tuple);
481 }
482
483 ReduceData (ReduceData<Ts...> const&) = delete;
484 ReduceData (ReduceData<Ts...> &&) = delete;
485 void operator= (ReduceData<Ts...> const&) = delete;
486 void operator= (ReduceData<Ts...> &&) = delete;
487
488 Type value ()
489 {
490 Type r = m_fn_value();
491 m_value_called = true;
492 return r;
493 }
494
495 template <typename... Ps>
496 Type value (ReduceOps<Ps...>& reduce_op)
497 {
498 Type r = reduce_op.value(*this);
499 m_value_called = true;
500 return r;
501 }
502
503 Type* devicePtr () { return m_device_tuple; }
504 Type* devicePtr (gpuStream_t const& s) {
505 return m_device_tuple+streamIndexChecked(s)*m_max_blocks;
506 }
507
508 Type* hostPtr () { return m_host_tuple; }
509
510 GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
511 int& nBlocks (gpuStream_t const& s) { return m_nblocks[streamIndexChecked(s)]; }
512
513 int maxBlocks () const { return m_max_blocks; }
514
515 int maxStreamIndex () const { return m_max_stream_index; }
516 void updateMaxStreamIndex (gpuStream_t const& s) {
517 m_max_stream_index = std::max(m_max_stream_index,streamIndexChecked(s));
518 }
519
520 void markValueCalled () noexcept { m_value_called = true; }
521
522private:
523 int streamIndexChecked (gpuStream_t const& s)
524 {
525 int const idx = Gpu::Device::streamIndex(s);
526 m_used_external_stream = m_used_external_stream || Gpu::Device::usingExternalStream();
527 if (idx == 0) {
528 if (m_stream_index_zero_set) {
529 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stream_index_zero == s,
530 "ReduceData cannot be reused across different external GPU streams "
531 "or between an external GPU stream and AMReX stream 0.");
532 } else {
533 m_stream_index_zero = s;
534 m_stream_index_zero_set = true;
535 }
536 }
537 return idx;
538 }
539
540 int m_max_blocks;
541 int m_max_stream_index = 0;
542 Type* m_host_tuple = nullptr;
543 Type* m_device_tuple = nullptr;
544 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
545 gpuStream_t m_stream_index_zero{};
546 bool m_stream_index_zero_set = false;
547 bool m_used_external_stream = false;
548 bool m_value_called = false;
549 std::function<Type()> m_fn_value;
550};
551
553namespace Reduce::detail {
554
555 // call_f_intvect_box
556
557 template <typename F, int dim>
559 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
560 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
561 {
562 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
563 }
564
565 template <typename F, int dim>
567 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
568 decltype(f(BoxND<dim>(iv, iv, t)))
569 {
570 return f(BoxND<dim>(iv, iv, t));
571 }
572
573 // call_f_intvect_n
574 template <typename F, typename T, int dim>
576 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
577 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
578 {
579 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
580 }
581
582 // mf_call_f
583
584 struct iterate_box {};
585 struct iterate_box_comp {};
586
587 template <typename I, typename F, typename T, typename... Ps,
588 std::enable_if_t<std::is_same_v<iterate_box,I>,int> = 0>
590 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
591 {
592 auto const& pr = f(ibox,i,j,k);
593 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
594 }
595
596 template <typename I, typename F, typename T, typename... Ps,
597 std::enable_if_t<std::is_same_v<iterate_box_comp,I>,int> = 0>
599 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
600 {
601 for (int n = 0; n < ncomp; ++n) {
602 auto const& pr = f(ibox,i,j,k,n);
603 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
604 }
605 }
606}
608
610template <typename... Ps>
611class ReduceOps
612{
613public:
614
616
617 // This is public for CUDA
618 template <typename I, typename MF, typename D, typename F>
619 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
620 {
621 using ReduceTuple = typename D::Type;
622 const int nboxes = mf.local_size();
623 if (nboxes > 0) {
624 auto const& parforinfo = mf.getParForInfo(nghost);
625 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
626 AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
627 const int nblocks = nblocks_per_box * nboxes;
628 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
629
630 auto const& stream = Gpu::gpuStream();
631 auto pdst = reduce_data.devicePtr(stream);
632 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
633 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
634 int& nblocks_ref = reduce_data.nBlocks(stream);
635 auto old_nblocks = static_cast<unsigned int>(nblocks_ref);
636 nblocks_ref = amrex::max(nblocks_ref, nblocks_ec);
637 reduce_data.updateMaxStreamIndex(stream);
638
639#ifdef AMREX_USE_SYCL
640 // device reduce needs local(i.e., shared) memory
641 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
642 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
643 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
644 {
645 Dim1 blockIdx {gh.blockIdx()};
646 Dim1 threadIdx{gh.threadIdx()};
647#else
648 amrex::launch_global<AMREX_GPU_MAX_THREADS>
649 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
650 ([=] AMREX_GPU_DEVICE () noexcept
651 {
652#endif
653 ReduceTuple r;
654 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
655 ReduceTuple& dst = pdst[blockIdx.x];
656 if (threadIdx.x == 0 && blockIdx.x >= old_nblocks) {
657 dst = r;
658 }
659 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
660 int ibox = iblock / nblocks_per_box;
661 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
662
663 BoxIndexer const& indexer = dp_boxes[ibox];
664 if (icell < indexer.numPts()) {
665 auto [i, j, k] = indexer(icell);
666 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
667 (f, ibox, i, j, k, ncomp, r);
668 }
669 }
670#ifdef AMREX_USE_SYCL
671 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
672#else
673 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
674#endif
675 });
676 }
677 }
678
679 // This is public for CUDA
680 template <typename I, int dim, typename D, typename F>
681 void eval_box (I, BoxND<dim> const& box, int ncomp, D& reduce_data, F const& f)
682 {
683 using ReduceTuple = typename D::Type;
684 auto const& stream = Gpu::gpuStream();
685 auto dp = reduce_data.devicePtr(stream);
686 int& nblocks = reduce_data.nBlocks(stream);
687 const BoxIndexerND<dim> indexer(box);
688 IndexTypeND<dim> ixtype = box.ixType();
689 constexpr int nitems_per_thread = 4;
690 Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
691 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
692 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
693 reduce_data.updateMaxStreamIndex(stream);
694#ifdef AMREX_USE_SYCL
695 // device reduce needs local(i.e., shared) memory
696 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
697 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
698 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
699 {
700 Dim1 blockIdx {gh.blockIdx()};
701 Dim1 threadIdx{gh.threadIdx()};
702 Dim1 gridDim {gh.gridDim()};
703#else
704 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
705 [=] AMREX_GPU_DEVICE () noexcept
706 {
707#endif
708 ReduceTuple r;
709 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
710 ReduceTuple& dst = *(dp+blockIdx.x);
711 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
712 dst = r;
713 }
714 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
715 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
716 icell < indexer.numPts();
717 icell += stride)
718 {
719 auto iv = indexer.intVect(icell);
720 amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture
721 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
722 auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
723 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
724 } else {
725 for (int n = 0; n < ncomp; ++n) {
726 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
727 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
728 }
729 }
730 }
731#ifdef AMREX_USE_SYCL
732 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
733#else
734 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
735#endif
736 });
737 nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
738 }
739
741
742 template <typename MF, typename D, typename F>
743 std::enable_if_t<IsFabArray<MF>::value
744#ifndef AMREX_USE_CUDA
745 && IsCallable<F, int, int, int, int>::value
746#endif
747 >
748 eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
749 {
750 using ReduceTuple = typename D::Type;
751 const int nboxes = mf.local_size();
752 if (nboxes == 0) {
753 return;
754 } else if (!mf.isFusingCandidate()) {
755 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
756 Box const& b = amrex::grow(mfi.validbox(), nghost);
757 const int li = mfi.LocalIndex();
758 this->eval(b, reduce_data,
759 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
760 {
761 return f(li, i, j, k);
762 });
763 }
764 } else {
765 eval_mf(Reduce::detail::iterate_box{},
766 mf, nghost, 0, reduce_data, std::forward<F>(f));
767 }
768 }
769
770 template <typename MF, typename D, typename F>
771 std::enable_if_t<IsFabArray<MF>::value
772#ifndef AMREX_USE_CUDA
773 && IsCallable<F, int, int, int, int, int>::value
774#endif
775 >
776 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
777 {
778 using ReduceTuple = typename D::Type;
779
780 const int nboxes = mf.local_size();
781
782 if (nboxes == 0) {
783 return;
784 } else if (!mf.isFusingCandidate()) {
785 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
786 Box const& b = amrex::grow(mfi.validbox(), nghost);
787 const int li = mfi.LocalIndex();
788 this->eval(b, ncomp, reduce_data,
789 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
790 {
791 return f(li, i, j, k, n);
792 });
793 }
794 } else {
795 eval_mf(Reduce::detail::iterate_box_comp{},
796 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
797 }
798 }
799
800 template <typename D, typename F, int dim>
801 void eval (BoxND<dim> const& box, D & reduce_data, F const& f)
802 {
803 eval_box(Reduce::detail::iterate_box{}, box, 0, reduce_data, f);
804 }
805
806 template <typename N, typename D, typename F, int dim,
807 typename M=std::enable_if_t<std::is_integral_v<N>> >
808 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
809 {
810 eval_box(Reduce::detail::iterate_box_comp{}, box, ncomp, reduce_data, f);
811 }
812
813 template <typename N, typename D, typename F,
814 typename M=std::enable_if_t<std::is_integral_v<N>> >
815 void eval (N n, D & reduce_data, F const& f)
816 {
817 if (n <= 0) { return; }
818 using ReduceTuple = typename D::Type;
819 auto const& stream = Gpu::gpuStream();
820 auto dp = reduce_data.devicePtr(stream);
821 int& nblocks = reduce_data.nBlocks(stream);
822 constexpr int nitems_per_thread = 4;
823 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
824 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
825 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
826 reduce_data.updateMaxStreamIndex(stream);
827#ifdef AMREX_USE_SYCL
828 // device reduce needs local(i.e., shared) memory
829 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
830 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
831 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
832 {
833 Dim1 blockIdx {gh.blockIdx()};
834 Dim1 threadIdx{gh.threadIdx()};
835 Dim1 gridDim {gh.gridDim()};
836#else
837 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
838 [=] AMREX_GPU_DEVICE () noexcept
839 {
840#endif
841 ReduceTuple r;
842 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
843 ReduceTuple& dst = *(dp+blockIdx.x);
844 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
845 dst = r;
846 }
847 for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
848 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
849 i < n;
850 i += stride)
851 {
852 auto pr = f(i);
853 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
854 }
855#ifdef AMREX_USE_SYCL
856 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
857#else
858 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
859#endif
860 });
861 nblocks = amrex::max(nblocks, nblocks_ec);
862 }
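// Editor's sketch (not part of the upstream header): the eval overload above also
// serves plain 1D index ranges (e.g. particle data). Continuing the usual
// reduce_op / reduce_data setup, with `np` and device pointer `w` assumed:
//
//     reduce_op.eval(np, reduce_data,
//         [=] AMREX_GPU_DEVICE (Long ip) -> ReduceTuple { return {w[ip]}; });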
863
864 template <typename D>
865 typename D::Type value (D & reduce_data)
866 {
867 auto hp = reduce_data.hostPtr();
868
869 if (m_result_is_ready) {
870 reduce_data.markValueCalled();
871 return *hp;
872 }
873
874 using ReduceTuple = typename D::Type;
875 auto const& stream = Gpu::gpuStream();
876 auto dp = reduce_data.devicePtr();
877 auto const& nblocks = reduce_data.nBlocks();
878#if defined(AMREX_USE_SYCL)
879 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
880 const int N = nblocks[0];
881 if (N == 0) {
882 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
883 } else {
884 Gpu::PinnedVector<ReduceTuple> tmp(N);
885 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
886 Gpu::streamSynchronize();
887 for (int i = 1; i < N; ++i) {
888 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
889 }
890 *hp = tmp[0];
891 }
892 } else
893#endif
894 {
895 int maxblocks = reduce_data.maxBlocks();
896#ifdef AMREX_USE_SYCL
897 // device reduce needs local(i.e., shared) memory
898 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
899#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
900 // xxxxx SYCL todo: reduce bug workaround
901 Gpu::DeviceVector<ReduceTuple> dtmp(1);
902 auto presult = dtmp.data();
903#else
904 auto presult = hp;
905#endif
906 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
907 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
908 {
909 ReduceTuple r;
910 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
911 ReduceTuple dst = r;
912 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
913 auto dp_stream = dp+istream*maxblocks;
914 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
915 i < nblocks[istream]; i += stride) {
916 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
917 }
918 }
919 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
920 if (gh.threadIdx() == 0) { *presult = dst; }
921 });
922#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
923 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
924#endif
925#else
926 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
927 [=] AMREX_GPU_DEVICE () noexcept
928 {
929 ReduceTuple r;
930 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
931 ReduceTuple dst = r;
932 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
933 auto dp_stream = dp+istream*maxblocks;
934 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
935 i < nblocks[istream]; i += stride) {
936 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
937 }
938 }
939 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
940 if (threadIdx.x == 0) { *hp = dst; }
941 });
942#endif
943 Gpu::streamSynchronize();
944 }
945
946 m_result_is_ready = true;
947 reduce_data.markValueCalled();
948 return *hp;
949 }
950
951private:
952 template <typename... T> friend class ReduceData;
953 bool m_result_is_ready = false;
954 void resetResultReadiness () { m_result_is_ready = false; }
955};
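// Editor's sketch (not part of the upstream header): the canonical ReduceOps /
// ReduceData pattern, here computing a sum and a maximum over a MultiFab `mf`
// (assumed to exist) in a single pass; names assume `using namespace amrex`.
//
//     ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//     ReduceData<Real, Real> reduce_data(reduce_op);
//     using ReduceTuple = typename decltype(reduce_data)::Type;
//     for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
//         const Box& bx = mfi.validbox();
//         auto const& a = mf.const_array(mfi);
//         reduce_op.eval(bx, reduce_data,
//             [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
//             { return {a(i,j,k), a(i,j,k)}; });
//     }
//     ReduceTuple hv = reduce_data.value(reduce_op);
//     Real local_sum = amrex::get<0>(hv);
//     Real local_max = amrex::get<1>(hv);
//     // Note: this is a per-rank result; an MPI reduction is still needed for a
//     // global value.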
956
957namespace Reduce {
958
959template <typename T, typename N,
960 std::enable_if_t<std::is_integral_v<N>,int> FOO>
961T Sum (N n, T const* v, T init_val)
962{
963 ReduceOps<ReduceOpSum> reduce_op;
964 ReduceData<T> reduce_data(reduce_op);
965 using ReduceTuple = typename decltype(reduce_data)::Type;
966 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
967 ReduceTuple hv = reduce_data.value(reduce_op);
968 return amrex::get<0>(hv) + init_val;
969}
970
971template <typename T, typename N, typename F,
972 std::enable_if_t<std::is_integral_v<N> &&
973 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
974T Sum (N n, F const& f, T init_val)
975{
976 ReduceOps<ReduceOpSum> reduce_op;
977 ReduceData<T> reduce_data(reduce_op);
978 using ReduceTuple = typename decltype(reduce_data)::Type;
979 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
980 ReduceTuple hv = reduce_data.value(reduce_op);
981 return amrex::get<0>(hv) + init_val;
982}
983
984template <typename T, typename N,
985 std::enable_if_t<std::is_integral_v<N>,int> FOO>
986T Min (N n, T const* v, T init_val)
987{
988 ReduceOps<ReduceOpMin> reduce_op;
989 ReduceData<T> reduce_data(reduce_op);
990 using ReduceTuple = typename decltype(reduce_data)::Type;
991 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
992 ReduceTuple hv = reduce_data.value(reduce_op);
993 return std::min(amrex::get<0>(hv),init_val);
994}
995
996template <typename T, typename N, typename F,
997 std::enable_if_t<std::is_integral_v<N> &&
998 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
999T Min (N n, F const& f, T init_val)
1000{
1001 ReduceOps<ReduceOpMin> reduce_op;
1002 ReduceData<T> reduce_data(reduce_op);
1003 using ReduceTuple = typename decltype(reduce_data)::Type;
1004 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
1005 ReduceTuple hv = reduce_data.value(reduce_op);
1006 return std::min(amrex::get<0>(hv),init_val);
1007}
1008
1009template <typename T, typename N,
1010 std::enable_if_t<std::is_integral_v<N>,int> FOO>
1011T Max (N n, T const* v, T init_val)
1012{
1013 ReduceOps<ReduceOpMax> reduce_op;
1014 ReduceData<T> reduce_data(reduce_op);
1015 using ReduceTuple = typename decltype(reduce_data)::Type;
1016 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
1017 ReduceTuple hv = reduce_data.value(reduce_op);
1018 return std::max(amrex::get<0>(hv),init_val);
1019}
1020
1021template <typename T, typename N, typename F,
1022 std::enable_if_t<std::is_integral_v<N> &&
1023 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1024T Max (N n, F const& f, T init_val)
1025{
1026 ReduceOps<ReduceOpMax> reduce_op;
1027 ReduceData<T> reduce_data(reduce_op);
1028 using ReduceTuple = typename decltype(reduce_data)::Type;
1029 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
1030 ReduceTuple hv = reduce_data.value(reduce_op);
1031 return std::max(amrex::get<0>(hv),init_val);
1032}
1033
1034template <typename T, typename N,
1035 std::enable_if_t<std::is_integral_v<N>,int> FOO>
1036std::pair<T,T> MinMax (N n, T const* v)
1037{
1038 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
1039 ReduceData<T,T> reduce_data(reduce_op);
1040 using ReduceTuple = typename decltype(reduce_data)::Type;
1041 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1042 return {v[i],v[i]};
1043 });
1044 auto hv = reduce_data.value(reduce_op);
1045 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1046}
1047
1048template <typename T, typename N, typename F,
1049 std::enable_if_t<std::is_integral_v<N> &&
1050 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1051std::pair<T,T> MinMax (N n, F const& f)
1052{
1053 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
1054 ReduceData<T,T> reduce_data(reduce_op);
1055 using ReduceTuple = typename decltype(reduce_data)::Type;
1056 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1057 T tmp = f(i);
1058 return {tmp,tmp};
1059 });
1060 auto hv = reduce_data.value(reduce_op);
1061 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1062}
1063
1064template <typename T, typename N, typename P,
1065 std::enable_if_t<std::is_integral_v<N>,int> FOO>
1066bool AnyOf (N n, T const* v, P const& pred)
1067{
1068 Gpu::LaunchSafeGuard lsg(true);
1069 Gpu::DeviceScalar<int> ds(false);
1070 int* dp = ds.dataPtr();
1071 auto ec = Gpu::ExecutionConfig(n);
1072 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1073
1074#ifdef AMREX_USE_SYCL
1075 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1076 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
1077 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1078 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
1079 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
1080 if (gh.threadIdx() == 0) { *has_any = *dp; }
1081 gh.sharedBarrier();
1082
1083 if (!(*has_any))
1084 {
1085 int r = false;
1086 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
1087 i < n && !r; i += stride)
1088 {
1089 r = pred(v[i]) ? 1 : 0;
1090 }
1091
1092 r = Gpu::blockReduce<Gpu::Device::warp_size>
1093 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
1094 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1095 }
1096 });
1097#else
1098 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
1099 [=] AMREX_GPU_DEVICE () noexcept {
1100 __shared__ int has_any;
1101 if (threadIdx.x == 0) { has_any = *dp; }
1102 __syncthreads();
1103
1104 if (!has_any)
1105 {
1106 int r = false;
1107 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
1108 i < n && !r; i += stride)
1109 {
1110 r = pred(v[i]) ? 1 : 0;
1111 }
1112 r = Gpu::blockReduce<Gpu::Device::warp_size>
1113 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
1114 if (threadIdx.x == 0 && r) *dp = 1;
1115 }
1116 });
1117#endif
1118 return ds.dataValue();
1119}
1120
1121template <typename P, int dim>
1122bool AnyOf (BoxND<dim> const& box, P const& pred)
1123{
1124 Gpu::LaunchSafeGuard lsg(true);
1125 Gpu::DeviceScalar<int> ds(false);
1126 int* dp = ds.dataPtr();
1127 const BoxIndexerND<dim> indexer(box);
1128 auto ec = Gpu::ExecutionConfig(box.numPts());
1129 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1130
1131#ifdef AMREX_USE_SYCL
1132 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1133 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
1134 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1135 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
1136 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
1137 if (gh.threadIdx() == 0) { *has_any = *dp; }
1138 gh.sharedBarrier();
1139
1140 if (!(*has_any))
1141 {
1142 int r = false;
1143 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
1144 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
1145 icell < indexer.numPts() && !r;
1146 icell += stride)
1147 {
1148 auto iv = indexer.intVect(icell);
1149 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1150 }
1151 r = Gpu::blockReduce<Gpu::Device::warp_size>
1152 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
1153 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1154 }
1155 });
1156#else
1157 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
1158 Gpu::gpuStream(),
1159 [=] AMREX_GPU_DEVICE () noexcept {
1160 __shared__ int has_any;
1161 if (threadIdx.x == 0) { has_any = *dp; }
1162 __syncthreads();
1163
1164 if (!has_any)
1165 {
1166 int r = false;
1167 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
1168 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
1169 icell < indexer.numPts() && !r;
1170 icell += stride)
1171 {
1172 auto iv = indexer.intVect(icell);
1173 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1174 }
1175 r = Gpu::blockReduce<Gpu::Device::warp_size>
1176 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
1177 if (threadIdx.x == 0 && r) *dp = 1;
1178 }
1179 });
1180#endif
1181 return ds.dataValue();
1182}
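// Editor's sketch (not part of the upstream header): Reduce::AnyOf over a Box,
// assuming `bx` is a Box and `a` is an Array4<Real const> valid on it; the
// predicate is invoked with the cell indices.
//
//     bool found = amrex::Reduce::AnyOf(bx,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k) -> bool
//         { return a(i,j,k) > 1.0; });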
1183
1184}
1185
1186#else
1187
1188template <typename... Ts>
1189class ReduceData
1190{
1191public:
1192 using Type = GpuTuple<Ts...>;
1193
1194 template <typename... Ps>
1195 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1196 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1197 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1198 {
1199 reduce_op.resetResultReadiness();
1200 for (auto& t : m_tuple) {
1201 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1202 }
1203 }
1204
1205 ~ReduceData () = default;
1206 ReduceData (ReduceData<Ts...> const&) = delete;
1207 ReduceData (ReduceData<Ts...> &&) = delete;
1208 void operator= (ReduceData<Ts...> const&) = delete;
1209 void operator= (ReduceData<Ts...> &&) = delete;
1210
1211 Type value () { return m_fn_value(); }
1212
1213 template <typename... Ps>
1214 Type value (ReduceOps<Ps...>& reduce_op)
1215 {
1216 return reduce_op.value(*this);
1217 }
1218
1219 Vector<Type>& reference () { return m_tuple; }
1220
1221 Type& reference (int tid)
1222 {
1223 if (m_tuple.size() == 1) {
1224 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1225 return m_tuple[0];
1226 } else {
1227 return m_tuple[tid];
1228 }
1229 }
1230
1231private:
1232 Vector<Type> m_tuple;
1233 std::function<Type()> m_fn_value;
1234};
1235
1236namespace Reduce::detail {
1237
1238 // call_f_intvect
1239
1240 template <typename F, int dim>
1242 auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept ->
1243 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1244 {
1245 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1246 }
1247
1248 // call_f_intvect_n
1249
1250 template <typename F, typename T, int dim>
1252 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
1253 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1254 {
1255 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1256 }
1257}
1258
1259template <typename... Ps>
1260class ReduceOps
1261{
1262private:
1263
1264 // call_f_box
1265
1266 template <typename D, typename F, int dim>
1268 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1269 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(
1270 Reduce::detail::call_f_intvect(f, IntVectND<dim>(0))
1271 )>, typename D::Type>>
1272 {
1273 using ReduceTuple = typename D::Type;
1274 For(box,
1275 [&] (IntVectND<dim> iv) {
1276 auto pr = Reduce::detail::call_f_intvect(f, iv);
1277 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1278 });
1279 }
1280
1281 template <typename D, typename F, int dim>
1283 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1284 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(box))>,
1285 typename D::Type>>
1286 {
1287 using ReduceTuple = typename D::Type;
1288 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1289 }
1290
1291public:
1292
1293 template <typename MF, typename D, typename F>
1294 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1295 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1296 {
1297 using ReduceTuple = typename D::Type;
1298#ifdef AMREX_USE_OMP
1299#pragma omp parallel
1300#endif
1301 {
1302 ReduceTuple rr;
1303 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1304 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1305 Box const& b = mfi.growntilebox(nghost);
1306 const int li = mfi.LocalIndex();
1307 const auto lo = amrex::lbound(b);
1308 const auto hi = amrex::ubound(b);
1309 for (int k = lo.z; k <= hi.z; ++k) {
1310 for (int j = lo.y; j <= hi.y; ++j) {
1311 for (int i = lo.x; i <= hi.x; ++i) {
1312 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1313 }}}
1314 }
1315 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1316 reduce_data.reference(OpenMP::get_thread_num()), rr);
1317 }
1318 }
1319
1320 template <typename MF, typename D, typename F>
1321 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1322 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1323 {
1324 using ReduceTuple = typename D::Type;
1325#ifdef AMREX_USE_OMP
1326#pragma omp parallel
1327#endif
1328 {
1329 ReduceTuple rr;
1330 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1331 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1332 Box const& b = mfi.growntilebox(nghost);
1333 const int li = mfi.LocalIndex();
1334 const auto lo = amrex::lbound(b);
1335 const auto hi = amrex::ubound(b);
1336 for (int n = 0; n < ncomp; ++n) {
1337 for (int k = lo.z; k <= hi.z; ++k) {
1338 for (int j = lo.y; j <= hi.y; ++j) {
1339 for (int i = lo.x; i <= hi.x; ++i) {
1340 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1341 }}}}
1342 }
1343 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1344 reduce_data.reference(OpenMP::get_thread_num()), rr);
1345 }
1346 }
1347
1348 template <typename D, typename F, int dim>
1349 void eval (BoxND<dim> const& box, D & reduce_data, F&& f)
1350 {
1351 using ReduceTuple = typename D::Type;
1352 ReduceTuple rr;
1353 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1354 call_f_box<D>(box, rr, std::forward<F>(f));
1355 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1356 reduce_data.reference(OpenMP::get_thread_num()), rr);
1357 }
1358
1359 template <typename N, typename D, typename F, int dim,
1360 typename M=std::enable_if_t<std::is_integral_v<N>> >
1361 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
1362 {
1363 using ReduceTuple = typename D::Type;
1364 ReduceTuple rr;
1365 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1366 For(box, ncomp,
1367 [&] (IntVectND<dim> iv, int n) {
1368 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1369 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1370 });
1371 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1372 reduce_data.reference(OpenMP::get_thread_num()), rr);
1373 }
1374
1375 template <typename N, typename D, typename F,
1376 typename M=std::enable_if_t<std::is_integral_v<N>> >
1377 void eval (N n, D & reduce_data, F const& f)
1378 {
1379 using ReduceTuple = typename D::Type;
1380 ReduceTuple rr;
1381 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1382 for (N i = 0; i < n; ++i) {
1383 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1384 }
1385 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1386 reduce_data.reference(OpenMP::get_thread_num()), rr);
1387 }
1388
1389 template <typename D>
1390 typename D::Type value (D & reduce_data)
1391 {
1392 auto& rrv = reduce_data.reference();
1393 if (! m_result_is_ready) {
1394 using ReduceTuple = typename D::Type;
1395 if (rrv.size() > 1) {
1396 for (int i = 1, N = rrv.size(); i < N; ++i) {
1397 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1398 }
1399 }
1400 m_result_is_ready = true;
1401 }
1402 return rrv[0];
1403 }
1404
1405private:
1406 template <typename... T> friend class ReduceData;
1407 bool m_result_is_ready = false;
1408 void resetResultReadiness () { m_result_is_ready = false; }
1409};
1410
1411namespace Reduce {
1412
1413template <typename T, typename N, typename F,
1414 std::enable_if_t<std::is_integral_v<N> &&
1415 !std::is_same_v<T*,std::decay_t<F>>,int> >
1416T Sum (N n, F const& f, T init_val)
1417{
1418 T r = init_val;
1419#ifdef AMREX_USE_OMP
1420#pragma omp parallel for reduction(+:r)
1421#endif
1422 for (N i = 0; i < n; ++i) {
1423 r += f(i);
1424 }
1425 return r;
1426}
1427
1428template <typename T, typename N,
1429 std::enable_if_t<std::is_integral_v<N>,int> >
1430T Sum (N n, T const* v, T init_val)
1431{
1432 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1433}
1434
1435template <typename T, typename N, typename F,
1436 std::enable_if_t<std::is_integral_v<N> &&
1437 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1438T Min (N n, F const& f, T init_val)
1439{
1440 T r = init_val;
1441#ifdef AMREX_USE_OMP
1442#pragma omp parallel for reduction(min:r)
1443#endif
1444 for (N i = 0; i < n; ++i) {
1445 r = std::min(r,f(i));
1446 }
1447 return r;
1448}
1449
1450template <typename T, typename N,
1451 std::enable_if_t<std::is_integral_v<N>,int> >
1452T Min (N n, T const* v, T init_val)
1453{
1454 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1455}
1456
1457template <typename T, typename N, typename F,
1458 std::enable_if_t<std::is_integral_v<N> &&
1459 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1460T Max (N n, F const& f, T init_val)
1461{
1462 T r = init_val;
1463#ifdef AMREX_USE_OMP
1464#pragma omp parallel for reduction(max:r)
1465#endif
1466 for (N i = 0; i < n; ++i) {
1467 r = std::max(r,f(i));
1468 }
1469 return r;
1470}
1471
1472template <typename T, typename N,
1473 std::enable_if_t<std::is_integral_v<N>,int> >
1474T Max (N n, T const* v, T init_val)
1475{
1476 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1477}
1478
1479template <typename T, typename N, typename F,
1480 std::enable_if_t<std::is_integral_v<N> &&
1481 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1482std::pair<T,T> MinMax (N n, F const& f)
1483{
1484 T r_min = std::numeric_limits<T>::max();
1485 T r_max = std::numeric_limits<T>::lowest();
1486#ifdef AMREX_USE_OMP
1487#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1488#endif
1489 for (N i = 0; i < n; ++i) {
1490 T tmp = f(i);
1491 r_min = std::min(r_min,tmp);
1492 r_max = std::max(r_max,tmp);
1493 }
1494 return std::make_pair(r_min,r_max);
1495}
1496
1497template <typename T, typename N, std::enable_if_t<std::is_integral_v<N>,int>>
1498std::pair<T,T> MinMax (N n, T const* v)
1499{
1500 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1501}
1502
1503template <typename T, typename N, typename P,
1504 std::enable_if_t<std::is_integral_v<N>,int> >
1505bool AnyOf (N n, T const* v, P&& pred)
1506{
1507 return std::any_of(v, v+n, std::forward<P>(pred));
1508}
1509
1510template <typename P, int dim>
1511bool AnyOf (BoxND<dim> const& box, P const& pred)
1512{
1513 for (auto iv : box.iterator()) { // NOLINT(readability-use-anyofallof)
1514 if (Reduce::detail::call_f_intvect(pred, iv)) { return true; }
1515 }
1516 return false;
1517}
1518
1519}
1520
1521#endif
1522
1527template <typename... Ts, typename... Ps>
1528AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
1529constexpr GpuTuple<Ts...>
1530IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
1531{
1532 GpuTuple<Ts...> r{};
1533 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1534 return r;
1535}
1536
1541template <typename... Ts, typename... Ps>
1542AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
1543constexpr GpuTuple<Ts...>
1544IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1545{
1546 GpuTuple<Ts...> r{};
1547 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1548 return r;
1549}
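// Editor's sketch (not part of the upstream header): IdentityTuple yields the
// identity element of each reduce operation, which is handy for initializing
// partial results held outside of ReduceOps::eval; names assume
// `using namespace amrex`.
//
//     auto r0 = IdentityTuple(GpuTuple<Real,Real>{},
//                             ReduceOps<ReduceOpMin,ReduceOpMax>{});
//     // get<0>(r0) == std::numeric_limits<Real>::max()
//     // get<1>(r0) == std::numeric_limits<Real>::lowest()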
1550
1552template <typename Ops, typename Ts>
1553class ReducerImpl;
1554
1555template <typename... Ops, typename... Ts>
1556class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
1557{
1558public:
1559 static_assert(sizeof...(Ops) > 0);
1560 static_assert(sizeof...(Ts) > 0);
1561 static_assert(sizeof...(Ops) == sizeof...(Ts));
1562
1563 ReducerImpl ()
1564 : m_reduce_data(m_reduce_op)
1565 {}
1566
1567protected:
1568 using Result_t = GpuTuple<Ts...>;
1569 ReduceOps<Ops...> m_reduce_op;
1570 ReduceData<Ts...> m_reduce_data;
1571};
1573
1638template <typename Ops, typename Ts>
1639class Reducer
1640 : public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
1641{
1642 using Base = ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>;
1643public:
1644
1645 using Result_t = typename Base::Result_t;
1646 static constexpr int size = GpuTupleSize<Result_t>::value;
1647
1648 Reducer () = default;
1649 ~Reducer () = default;
1650
1652 Reducer (Reducer const&) = delete;
1653 Reducer (Reducer &&) = delete;
1654 void operator= (Reducer const&) = delete;
1655 void operator= (Reducer &&) = delete;
1657
1671 template <typename F, int dim>
1672 std::enable_if_t<IsCallable<F, int, int, int>::value ||
1673 IsCallable<F, IntVectND<dim>>::value>
1674 eval (BoxND<dim> const& box, F&& f)
1675 {
1676 this->m_reduce_op.eval(box, this->m_reduce_data, std::forward<F>(f));
1677 }
1678
1694 template <typename F, int dim>
1695 std::enable_if_t<IsCallable<F, int, int, int, int>::value ||
1696 IsCallable<F, IntVectND<dim>, int>::value>
1697 eval (BoxND<dim> const& box, int ncomp, F&& f)
1698 {
1699 this->m_reduce_op.eval(box, ncomp, this->m_reduce_data, std::forward<F>(f));
1700 }
1701
1721 template <typename MF, typename F>
1722 std::enable_if_t<IsFabArray<MF>::value &&
1723 IsCallable<F, int, int, int, int>::value>
1724 eval (MF const& mf, IntVect const& nghost, F && f)
1725 {
1726 this->m_reduce_op.eval(mf, nghost, this->m_reduce_data, std::forward<F>(f));
1727 }
1728
1751 template <typename MF, typename F>
1752 std::enable_if_t<IsFabArray<MF>::value &&
1753 IsCallable<F, int, int, int, int, int>::value>
1754 eval (MF const& mf, IntVect const& nghost, int ncomp, F && f)
1755 {
1756 this->m_reduce_op.eval(mf, nghost, ncomp, this->m_reduce_data, std::forward<F>(f));
1757 }
1758
1771 template <typename N, typename F>
1772 std::enable_if_t<IsCallable<F, N>::value>
1773 eval (N n, F && f)
1774 {
1775 this->m_reduce_op.eval(n, this->m_reduce_data, std::forward<F>(f));
1776 }
1777
1788 [[nodiscard]] Result_t getResult ()
1789 {
1790 return this->m_reduce_data.value(this->m_reduce_op);
1791 }
1792};
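// Editor's sketch (not part of the upstream header): one plausible way to use the
// Reducer wrapper above, assuming ToTypeList_t accepts ReduceOps<...> and
// GpuTuple<...> instantiations, and that `bx` and Array4 `a` exist as before.
//
//     Reducer<ReduceOps<ReduceOpSum,ReduceOpMin>, GpuTuple<Real,Real>> reducer;
//     reducer.eval(bx,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k) -> GpuTuple<Real,Real>
//         { return {a(i,j,k), a(i,j,k)}; });
//     auto result = reducer.getResult();   // GpuTuple<Real,Real>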
1793
1794}
1795
1796#endif