AMReX: Block-Structured AMR Software Framework — source listing of AMReX_FBI.H
(FillBoundary implementation details). See the AMReX documentation for this file.
1#ifndef AMREX_FBI_H_
2#define AMREX_FBI_H_
3
4namespace amrex {
5
6template <class FAB>
7struct FabCopyTag {
8 FAB const* sfab;
10 IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
11};
12
14 char const* p;
16};
17
19namespace detail {
20
21#ifdef AMREX_USE_GPU
22
23template <class T0, class T1>
24struct CellStore
25{
27 operator() (T0* d, T1 s) const noexcept
28 {
29 *d = static_cast<T0>(s);
30 }
31};
32
33template <class T0, class T1>
34struct CellAdd
35{
37 operator() (T0* d, T1 s) const noexcept
38 {
39 *d += static_cast<T0>(s);
40 }
41};
42
43template <class T0, class T1>
44struct CellAtomicAdd
45{
46 template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
48 operator() (U0* d, T1 s) const noexcept
49 {
50 Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
51 }
52};
53
54template <class T0, class T1, class F>
55void
56fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
57 F && f)
58{
59 TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
60
61 detail::ParallelFor_doit(tv,
63 int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const& tag) noexcept
64 {
65 if (icell < ncells) {
66 for (int n = 0; n < ncomp; ++n) {
67 f(&(tag.dfab(i,j,k,n+dcomp)),
68 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
69 }
70 }
71 });
72}
73
74template <class TagType, class F>
75void
76fab_to_fab_store (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
77{
79 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
80 {
81 int m = Gpu::Atomic::Add(&(tag.mask(i,j,k)), 1);
82 if (m == 0) {
83 for (int n = 0; n < ncomp; ++n) {
84 f(&(tag.dfab(i,j,k,n+dcomp)),
85 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
86 }
87 }
88 });
89}
90
91template <class TagType, class F>
92void
93fab_to_fab_other (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
94{
96 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
97 {
98 int* m = &(tag.mask(i,j,k));
99 bool my_turn = false;
100 do {
101#if defined(AMREX_USE_SYCL)
102 my_turn = (Gpu::Atomic::Exch(m, 1) == 0);
103#else
104 my_turn = (Gpu::Atomic::CAS(m, 0, 1) == 0);
105#endif
106 if (my_turn) {
107#if defined(AMREX_USE_SYCL)
108 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
109#else
110 __threadfence();
111#endif
112 for (int n = 0; n < ncomp; ++n) {
113 f(&(tag.dfab(i,j,k,n+dcomp)),
114 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
115 }
116#if defined(AMREX_USE_SYCL)
117 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
118#else
119 __threadfence(); // It appears that we need this fence too if a GPU is shared.
120#endif
121 Gpu::Atomic::Exch(m, 0);
122 }
123 else {
124#if defined(AMREX_USE_CUDA)
125
126#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
127#if defined(_WIN32)
128 volatile int tmp; // prevent optimization
129 for (int c = 0; c < 2; ++c) {
130 ++tmp;
131 }
132#else
133 for (int c = 0; c < 2; ++c) {
134 __asm__ volatile(""); // prevent optimization
135 }
136#endif
137#else
138 __nanosleep(1);
139#endif
140
141#elif defined(AMREX_USE_HIP)
142
143 __builtin_amdgcn_s_sleep(1);
144
145#elif defined(AMREX_USE_SYCL)
146
147 for (int c = 0; c < 2; ++c) {
148 __asm__ volatile(""); // prevent optimization
149 }
150
151#endif
152 }
153 } while (!my_turn);
154 });
155}
156
157template <class T0, class T1, class F>
158void
159fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
160 int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
161{
162 using TagType = Array4MaskCopyTag<T0, T1>;
163 Vector<TagType> tags;
164 const int N = copy_tags.size();
165 tags.reserve(N);
166 for (int i = 0; i < N; ++i) {
167 tags.push_back(TagType{.dfab = copy_tags[i].dfab, .sfab = copy_tags[i].sfab,
168 .mask = masks[i].dfab, .dbox = copy_tags[i].dbox,
169 .offset = copy_tags[i].offset});
170 }
171
172 if constexpr (std::is_same_v<F, CellStore<T0,T1>>)
173 {
174 fab_to_fab_store(tags, scomp, dcomp, ncomp, std::forward<F>(f));
175 }
176 else
177 {
178 fab_to_fab_other(tags, scomp, dcomp, ncomp, std::forward<F>(f));
179 }
180 // Note that Tag ParalleFor has a stream sync call in the end.
181}
182
183template <typename T0, typename T1,
184 std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
185void
186fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
187 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
188{
189 fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
190}
191
192template <typename T0, typename T1,
193 std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
194void
195fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
196 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
197{
198 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
199}
200
201template <typename T0, typename T1,
202 std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
203void
204fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
205 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
206{
207 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
208}
209
210template <typename T0, typename T1,
211 std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
212void
213fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
214 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
215{
216 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
217}
218
// Deterministic (order-independent) fab-to-fab application of f:
// destination boxes are chopped into fixed-size aligned tiles, tiles are
// sorted by (destination fab index, tile box), and all tiles targeting the
// same destination region are processed by a single GPU block in sorted
// order, so floating-point accumulation order is reproducible.
template <typename T0, typename T1, class F>
void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int scomp,
                               int dcomp, int ncomp, F const& f)
{
    if (a_tags.empty()) { return; }

    using TagType = Array4CopyTag<T0,T1>;

    // One tile of one tag's destination box, ordered by (dindex, tile box).
    struct TiledTag {
        int tag_index;                      // index into a_tags
        std::pair<int,Box> dindex_tilebox;  // (destination fab index, tile box)
        bool operator< (TiledTag const& rhs) const noexcept {
            return this->dindex_tilebox < rhs.dindex_tilebox;
        }
        bool operator!= (TiledTag const& rhs) const noexcept {
            return this->dindex_tilebox != rhs.dindex_tilebox;
        }
    };
    Vector<TiledTag> tiled_tags;

    auto const ixtype = a_tags[0].dbox.ixType();

    // Chop every destination box into tile_size-aligned tiles.
    constexpr int tile_size = 64;
    for (int itag = 0; itag < a_tags.size(); ++itag) {
        auto const& tag = a_tags[itag];
        auto const& dlo = tag.dbox.smallEnd();
        auto const& dhi = tag.dbox.bigEnd();
        IntVect tlo(AMREX_D_DECL(amrex::coarsen<tile_size>(dlo[0]),
                                 amrex::coarsen<tile_size>(dlo[1]),
                                 amrex::coarsen<tile_size>(dlo[2])));
        IntVect thi(AMREX_D_DECL(amrex::coarsen<tile_size>(dhi[0]),
                                 amrex::coarsen<tile_size>(dhi[1]),
                                 amrex::coarsen<tile_size>(dhi[2])));
#if (AMREX_SPACEDIM == 3)
        for (int kt = tlo[2]; kt <= thi[2]; ++kt)
#endif
        {
#if (AMREX_SPACEDIM >= 2)
        for (int jt = tlo[1]; jt <= thi[1]; ++jt)
#endif
        {
        for (int it = tlo[0]; it <= thi[0]; ++it)
        {
            IntVect lo(AMREX_D_DECL(it*tile_size,
                                    jt*tile_size,
                                    kt*tile_size));
            tiled_tags.push_back(TiledTag{
                    .tag_index = itag,
                    .dindex_tilebox = std::make_pair(tag.dindex, Box(lo, lo+(tile_size-1), ixtype))
                });
        }
        }
        }
    }

    // Sort so tiles with the same (dindex, tilebox) become adjacent.
    std::sort(tiled_tags.begin(), tiled_tags.end());

    // h_ntags[b] .. h_ntags[b+1] delimits the run of tags that GPU block b
    // must process sequentially (they write the same destination tile).
    Gpu::HostVector<unsigned int> h_ntags;
    Gpu::HostVector<TagType> h_tags;
    h_tags.reserve(tiled_tags.size());

    for (unsigned int itag = 0; itag < tiled_tags.size(); ++itag) {
        if (itag == 0) {
            h_ntags.push_back(0);
        } else if (tiled_tags[itag-1] != tiled_tags[itag]) {
            h_ntags.push_back(itag); // start of a new (dindex, tilebox) group
        }
        auto const& ttag = tiled_tags[itag];
        auto const& btag = a_tags[ttag.tag_index];
        h_tags.push_back(TagType{.dfab = btag.dfab, .dindex = btag.dindex, .sfab = btag.sfab,
                                 .dbox = btag.dbox & ttag.dindex_tilebox.second,
                                 .offset = btag.offset});
    }
    h_ntags.push_back((unsigned int)tiled_tags.size());

    Gpu::DeviceVector<TagType> d_tags(h_tags.size());
    Gpu::DeviceVector<unsigned int> d_ntags(h_ntags.size());
    Gpu::copyAsync(Gpu::hostToDevice,h_tags.begin(),h_tags.end(),d_tags.begin());
    Gpu::copyAsync(Gpu::hostToDevice,h_ntags.begin(),h_ntags.end(),d_ntags.begin());
    auto const* ptag = d_tags.data();
    auto const* pntags = d_ntags.data();
    auto const nblocks = int(h_ntags.size()-1);
    constexpr auto nthreads = 256;
    amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
#ifdef AMREX_USE_SYCL
    [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
    [[sycl::reqd_work_group_size(nthreads)]]
#else
    [=] AMREX_GPU_DEVICE () noexcept
#endif
    {
#ifdef AMREX_USE_SYCL
        Dim1 blockIdx{item.get_group_linear_id()};
        Dim1 threadIdx{item.get_local_linear_id()};
#endif

        // Each block walks its group's tags in sorted order; the barrier
        // between consecutive tags serializes updates to the shared tile.
        for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
            auto const tag = ptag[itag];
            auto ncells = int(tag.dbox.numPts());
            const auto len = amrex::length(tag.dbox);
            const auto lo = amrex::lbound(tag.dbox);
            for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
                // Decode linear cell index into (i,j,k) within tag.dbox.
                int k = icell / (len.x*len.y);
                int j = (icell - k*(len.x*len.y)) / len.x;
                int i = (icell - k*(len.x*len.y)) - j*len.x;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                for (int n = 0; n < ncomp; ++n) {
                    f(tag.dfab.ptr(i,j,k,n+dcomp),
                      tag.sfab(i + tag.offset.x,
                               j + tag.offset.y,
                               k + tag.offset.z, n+scomp));
                }
            }

            if (itag+1 < pntags[blockIdx.x+1]) {
#ifdef AMREX_USE_SYCL
                sycl::group_barrier(item.get_group());
#else
                __syncthreads();
#endif
            }
        }
    });
    // NOTE(review): the extracted listing appears to drop a line here
    // (original line 344) -- presumably a stream synchronization or error
    // check before d_tags/d_ntags go out of scope; verify against upstream.
}
346
// Unpack a receive buffer into destination fabs with atomic adds.
// B is the buffer element type, V the destination value type; enabled only
// when V supports Gpu::Atomic::AddNoRet.
template <typename B, typename V, typename TT,
          std::enable_if_t<amrex::HasAtomicAdd<V>::value,int> FOO = 0>
void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
                                        int dcomp, int ncomp)
{
    detail::ParallelFor_doit(tv,
    [=] AMREX_GPU_DEVICE (
        int icell, int ncells, int i, int j, int k, TT const& tag) noexcept
    {
        if (icell < ncells) {
            // View the raw bytes at this tag's buffer offset as an Array4
            // spanning tag.bx.
            Array4<B const> sfab{(B const*)(pbuffer+tag.poff),
                                 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
            for (int n = 0; n < ncomp; ++n) {
                Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n+dcomp),
                                      (V)sfab(i,j,k,n));
            }
        }
    });
}
366
// Fallback overload for value types without device atomic-add support.
// Reaching this at run time is a logic error, hence the Abort.
template <typename B, typename V, typename TT,
          std::enable_if_t<!amrex::HasAtomicAdd<V>::value,int> FOO = 0>
void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
                                        int dcomp, int ncomp)
{
    amrex::ignore_unused(pbuffer, tv, dcomp, ncomp);
    amrex::Abort("unpack_recv_buffer_gpu: should NOT get here");
}
375
376#endif /* AMREX_USE_GPU */
377
378}
380
381template <class FAB>
382void
383FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
384{
385 auto const& LocTags = *(TheFB.m_LocTags);
386 auto N_locs = static_cast<int>(LocTags.size());
387 if (N_locs == 0) { return; }
388 bool is_thread_safe = TheFB.m_threadsafe_loc;
389 if (is_thread_safe)
390 {
391#ifdef AMREX_USE_OMP
392#pragma omp parallel for
393#endif
394 for (int i = 0; i < N_locs; ++i)
395 {
396 const CopyComTag& tag = LocTags[i];
397
398 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
399 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
400
401 const FAB* sfab = &(get(tag.srcIndex));
402 FAB* dfab = &(get(tag.dstIndex));
403 dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
404 }
405 }
406 else
407 {
409 for (int i = 0; i < N_locs; ++i)
410 {
411 const CopyComTag& tag = LocTags[i];
412
413 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
414 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
415
416 loc_copy_tags[tag.dstIndex].push_back
417 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
418 }
419#ifdef AMREX_USE_OMP
420#pragma omp parallel
421#endif
422 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
423 {
424 const auto& tags = loc_copy_tags[mfi];
425 auto dfab = this->array(mfi);
426 for (auto const & tag : tags)
427 {
428 auto const sfab = tag.sfab->array();
429 const auto offset = tag.offset.dim3();
430 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
431 [=] (int i, int j, int k, int n) noexcept
432 {
433 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
434 });
435 }
436 }
437 }
438}
439
440template <class FAB>
441void
442FabArray<FAB>::FB_local_add_cpu (const FB& TheFB, int scomp, int ncomp)
443{
444 auto const& LocTags = *(TheFB.m_LocTags);
445 auto N_locs = static_cast<int>(LocTags.size());
446 if (N_locs == 0) { return; }
447
449 // We must make a temporary copy of the data first to avoid race condition.
450 std::vector<FAB> src_fabs(N_locs);
451 for (int itag = 0; itag < N_locs; ++itag) {
452 const CopyComTag& tag = LocTags[itag];
453 src_fabs[itag].resize(tag.sbox,ncomp);
454 loc_copy_tags[tag.dstIndex].push_back
455 (FabCopyTag<FAB>{&(src_fabs[itag]),
456 tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
457 }
458
459#ifdef AMREX_USE_OMP
460#pragma omp parallel for
461#endif
462 for (int itag = 0; itag < N_locs; ++itag) {
463 const CopyComTag& tag = LocTags[itag];
464 src_fabs[itag].template copy<RunOn::Host>(this->operator[](tag.srcIndex), scomp, 0, ncomp);
465 }
466
467#ifdef AMREX_USE_OMP
468#pragma omp parallel
469#endif
470 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
471 {
472 const auto& tags = loc_copy_tags[mfi];
473 const auto& dfab = this->array(mfi);
474 for (auto const & tag : tags)
475 {
476 auto const sfab = tag.sfab->array();
477 const auto offset = tag.offset.dim3();
478 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
479 [&] (int i, int j, int k, int n) noexcept
480 {
481 dfab(i,j,k,n+scomp) += sfab(i+offset.x,j+offset.y,k+offset.z,n);
482 });
483 }
484 }
485}
486
487#ifdef AMREX_USE_GPU
488
template <class FAB>
// NOTE(review): the extracted listing drops the return-type and signature
// lines here (original lines 490-491); by the body and the caller in
// FB_local_copy_gpu this is FB_get_local_copy_tag_vector(const FB&),
// returning a cached TagVector<Array4CopyTag<value_type>>*. Verify
// against upstream before editing.
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();

    using TagType = Array4CopyTag<value_type>;

    // NOTE(review): a declaration line appears to be missing here
    // (original line 498), presumably "TagVector<TagType>* tv = nullptr;".
    if (auto it = m_fb_local_copy_handler.find(TheFB.m_id);
        it != m_fb_local_copy_handler.end())
    {
        // Cache hit: reuse the tag vector previously built for this FB id.
        tv = it->second.get();
    } else {
        Vector<TagType> loc_copy_tags;
        loc_copy_tags.reserve(N_locs);

        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            int li = this->localindex(tag.dstIndex);
            loc_copy_tags.push_back
                (TagType{.dfab = this->atLocalIdx(li).array(),
                         .dindex = tag.dstIndex,
                         .sfab = this->fabPtr(tag.srcIndex)->const_array(),
                         .dbox = tag.dbox,
                         .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
        }

        // Build and cache the device tag vector for reuse on later calls.
        auto utv = std::make_unique<TagVector<TagType>>(loc_copy_tags);
        tv = utv.get();
        m_fb_local_copy_handler[TheFB.m_id] = std::move(utv);
    }

    return tv;
}
530
// Perform the on-rank (local) part of FillBoundary on the GPU. If writes
// are race free (thread-safe FB) or stores of value_type are atomic, a
// plain copy kernel over the cached tag vector is used; otherwise
// per-destination mask fabs serialize overlapping writes.
template <class FAB>
void
FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }
    bool is_thread_safe = TheFB.m_threadsafe_loc;

    using TagType = Array4CopyTag<value_type>;

    if (is_thread_safe || amrex::IsStoreAtomic<value_type>::value)
    {
        // Fast path: reuse (or build) the cached tag vector for this FB.
        auto* tv = FB_get_local_copy_tag_vector(TheFB);

        detail::ParallelFor_doit(*tv,
        [=] AMREX_GPU_DEVICE (
            int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
        {
            if (icell < ncells) {
                for (int n = 0; n < ncomp; ++n) {
                    tag.dfab(i,j,k,n+scomp) = tag.sfab(i+tag.offset.x,
                                                       j+tag.offset.y,
                                                       k+tag.offset.z,n+scomp);
                }
            }
        });
    }
    else
    {
        Vector<TagType> loc_copy_tags;
        loc_copy_tags.reserve(N_locs);

        // One mask fab per local destination fab; used by
        // fab_to_fab_atomic_cpy to serialize overlapping writes.
        Vector<BaseFab<int>> maskfabs(this->local_size());
        Vector<Array4Tag<int> > masks_unique;
        masks_unique.reserve(this->local_size());
        Vector<Array4Tag<int> > masks;
        masks.reserve(N_locs);

        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            int li = this->localindex(tag.dstIndex);
            loc_copy_tags.push_back
                (TagType{.dfab = this->atLocalIdx(li).array(),
                         .dindex = tag.dstIndex,
                         .sfab = this->fabPtr(tag.srcIndex)->const_array(),
                         .dbox = tag.dbox,
                         .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});

            // Allocate the mask lazily on first use of this destination fab.
            if (!maskfabs[li].isAllocated()) {
                maskfabs[li].resize(this->atLocalIdx(li).box());
                masks_unique.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
            }
            masks.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
        }

        // Zero the masks before use.
        amrex::ParallelFor(masks_unique,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
        {
            msk.dfab(i,j,k) = 0;
        });

        detail::fab_to_fab_atomic_cpy<value_type, value_type>(
            loc_copy_tags, scomp, scomp, ncomp, masks);
    }
}
602
// Perform the on-rank (local) part of an additive FillBoundary on the GPU.
// Pass 1 copies sources into temporaries; pass 2 adds the temporaries into
// the destinations. The temporaries avoid read/write races when source and
// destination regions overlap.
template <class FAB>
void
FabArray<FAB>::FB_local_add_gpu (const FB& TheFB, int scomp, int ncomp, bool deterministic)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }

    using TagType = Array4CopyTag<value_type>;

    Vector<TagType> loc_copy_tags_1, loc_copy_tags_2;
    loc_copy_tags_1.reserve(N_locs);
    loc_copy_tags_2.reserve(N_locs);

    Vector<FAB> src_fabs(N_locs);
    for (int itag = 0; itag < N_locs; ++itag) {
        const CopyComTag& tag = LocTags[itag];
        src_fabs[itag].resize(tag.sbox,ncomp);
        // Pass 1 tag: this->src -> temporary (component scomp -> 0).
        loc_copy_tags_1.push_back(
            TagType{.dfab = src_fabs[itag].array(), .dindex = -1,
                    .sfab = this->const_array(tag.srcIndex,scomp), .dbox = tag.sbox,
                    .offset = Dim3{.x = 0, .y = 0, .z = 0}});
        // Pass 2 tag: temporary -> this->dst (component 0 -> scomp).
        loc_copy_tags_2.push_back(
            TagType{.dfab = this->array(tag.dstIndex,scomp), .dindex = tag.dstIndex,
                    .sfab = src_fabs[itag].const_array(), .dbox = tag.dbox,
                    .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
    }

    // Note that we have shifted the starting component to zero in the code above.

    // TODO: We could try to cache the tags like in FillBoundary.

    detail::fab_to_fab(loc_copy_tags_1, 0, 0, ncomp,
                       detail::CellStore<value_type, value_type>{});
    if (deterministic || ! amrex::HasAtomicAdd<value_type>::value) {
        detail::deterministic_fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
                                         detail::CellAdd<value_type,value_type>{});
    } else {
        // NOTE(review): the extracted listing drops a line here (original
        // line 641) and the "((void)0);" plus extra closing brace below look
        // garbled -- possibly an "if constexpr" guard around the atomic-add
        // instantiation. Verify against upstream before editing.
        detail::fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
                           detail::CellAtomicAdd<value_type, value_type>{});
    }
    ((void)0);
    }

    // Note that fab_to_fab is synchronous.
}
650
template <class FAB>
void
// NOTE(review): the extracted listing drops the signature line here
// (original line 653); by the body this is the GPU setVal over the
// CommMetaData's local tags, taking the fill value "x" (value_type) as the
// first parameter (e.g. CMD_local_setVal_gpu). Verify against upstream.
                            const CommMetaData& thecmd, int scomp, int ncomp)
{
    auto const& LocTags = *(thecmd.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }
    bool is_thread_safe = thecmd.m_threadsafe_loc;

    using TagType = Array4BoxTag<value_type>;
    Vector<TagType> loc_setval_tags;
    loc_setval_tags.reserve(N_locs);

    // NOTE(review): a line is missing here (original line 665); possibly
    // "amrex::ignore_unused(is_thread_safe);" since the kernel below does
    // not branch on it. Verify against upstream.

    for (int i = 0; i < N_locs; ++i)
    {
        const CopyComTag& tag = LocTags[i];
        BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
        loc_setval_tags.push_back(TagType{.dfab = this->array(tag.dstIndex), .dbox = tag.dbox});
    }

    // Set every covered cell of components [scomp, scomp+ncomp) to x.
    amrex::ParallelFor(loc_setval_tags, ncomp,
    [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
    {
        tag.dfab(i,j,k,n+scomp) = x;
    });
}
680
template <class FAB>
void
// NOTE(review): the extracted listing drops the signature line here
// (original line 683); by the body this is the GPU setVal over the
// CommMetaData's receive tags, taking the fill value "x" (value_type) as
// the first parameter (e.g. CMD_remote_setVal_gpu). Verify against
// upstream.
                            const CommMetaData& thecmd, int scomp, int ncomp)
{
    auto const& RcvTags = *(thecmd.m_RcvTags);
    bool is_thread_safe = thecmd.m_threadsafe_rcv;

    using TagType = Array4BoxTag<value_type>;
    Vector<TagType> rcv_setval_tags;

    // Flatten all receive-side destination boxes into one tag list.
    for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
        for (auto const& tag: it->second) {
            rcv_setval_tags.push_back(TagType{.dfab = this->array(tag.dstIndex), .dbox = tag.dbox});
        }
    }

    if (rcv_setval_tags.empty()) { return; }

    // NOTE(review): a line is missing here (original line 700); possibly
    // "amrex::ignore_unused(is_thread_safe);". Verify against upstream.

    // Set every covered cell of components [scomp, scomp+ncomp) to x.
    amrex::ParallelFor(rcv_setval_tags, ncomp,
    [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
    {
        tag.dfab(i,j,k,n+scomp) = x;
    });
}
708
709#if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
710template <class FAB>
711void
712FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
713{
714 const int N_locs = (*TheFB.m_LocTags).size();
716 for (int i = 0; i < N_locs; ++i)
717 {
718 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
719
720 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
721 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
722
723 loc_copy_tags[tag.dstIndex].push_back
724 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
725 }
726
727 // Create Graph if one is needed.
728 if ( !(TheFB.m_localCopy.ready()) )
729 {
730 const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
731
732 int idx = 0;
733 // Record the graph.
734 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
735 {
736 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
737 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
738 (TheFB).m_localCopy.getDevicePtr(0),
739 std::size_t(sizeof(CopyMemory)*N_locs) );
740
741 const auto& tags = loc_copy_tags[mfi];
742 for (auto const & tag : tags)
743 {
744 const auto offset = tag.offset.dim3();
745 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
746 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
747 {
748 // Build the Array4's.
749 auto const dst = cmem->getDst<value_type>();
750 auto const src = cmem->getSrc<value_type>();
751 for (int n = 0; n < cmem->ncomp; ++n) {
752 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
753 }
754 });
755 }
756
757 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
758 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
759 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
760 }
761 }
762
763 // Setup Launch Parameters
764 // This is perfectly threadable, right?
765 // Additional optimization -> Check to see whether values need to be reset?
766 // Can then remove this setup and memcpy from CudaGraph::executeGraph.
767 int idx = 0;
768 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
769 {
770 auto const dst_array = this->array(mfi);
771 const auto& tags = loc_copy_tags[mfi];
772 for (auto const & tag : tags)
773 {
774 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
775 dst_array,
776 scomp, ncomp));
777 }
778 }
779
780 // Launch Graph
781 TheFB.m_localCopy.executeGraph();
782}
783
784#ifdef AMREX_USE_MPI
// Team variant of FB_local_copy_cuda_graph_1: records one graph launch per
// tag on rotating streams; only tags whose destination is owned by this
// rank are copied locally.
template <class FAB>
void
FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
{
    const int N_locs = TheFB.m_LocTags->size();

    int launches = 0; // Used for graphs only.
    LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
    for (int i = 0; i < N_locs; ++i)
    {
        const CopyComTag& tag = (*TheFB.m_LocTags)[i];

        BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
        BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));

        // Only copy locally when this rank owns the destination fab.
        if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
        {
            loc_copy_tags[tag.dstIndex].push_back
                ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
            launches++;
        }
    }

    FillBoundary_test();

    if ( !(TheFB.m_localCopy.ready()) )
    {
        const_cast<FB&>(TheFB).m_localCopy.resize(launches);

        int idx = 0;
        int cuda_stream = 0;
        for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
        {
            const auto& tags = loc_copy_tags[mfi];
            for (int t = 0; t<tags.size(); ++t)
            {
                // Round-robin streams so independent copies can overlap.
                Gpu::Device::setStreamIndex(cuda_stream++);
                amrex::Gpu::Device::startGraphRecording( (idx == 0),
                                                         const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
                                                         (TheFB).m_localCopy.getDevicePtr(0),
                                                         std::size_t(sizeof(CopyMemory)*launches) );

                const auto& tag = tags[t];
                const Dim3 offset = tag.offset.dim3();

                CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
                AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
                {
                    // Array4's are rebuilt from cmem at replay time.
                    auto const dst = cmem->getDst<value_type>();
                    auto const src = cmem->getSrc<value_type>();
                    for (int n = 0; n < cmem->ncomp; ++n) {
                        dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
                    }
                });

                bool last_iter = idx == launches;
                cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
                if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
            }
        }
    }

    // Setup Launch Parameters
    // This is perfectly threadable, right?
    int idx = 0;
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        const auto& dst_array = this->array(mfi);
        const auto& tags = loc_copy_tags[mfi];
        for (auto const & tag : tags)
        {
            const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
                                                                               dst_array,
                                                                               scomp, ncomp));
        }
    }

    // Launch Graph without synch. Local work is entirely independent.
    TheFB.m_localCopy.executeGraph(false);
}
865#endif /* AMREX_USE_MPI */
866
867#endif /* __CUDACC__ */
868
869#endif /* AMREX_USE_GPU */
870
871#ifdef AMREX_USE_MPI
872
873#ifdef AMREX_USE_GPU
874
875#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
876
// Pack the MPI send buffers via a recorded CUDA graph: the graph (one
// kernel per tag) is recorded once per FB and then replayed with
// per-launch parameters supplied through CopyMemory.
template <class FAB>
void
FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
                                               Vector<char*>& send_data,
                                               Vector<std::size_t> const& send_size,
                                               Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
{
    const int N_snds = send_data.size();
    if (N_snds == 0) { return; }

    if ( !(TheFB.m_copyToBuffer.ready()) )
    {
        // Set size of CudaGraph buffer.
        // Is the conditional ever expected false?
        int launches = 0;
        for (int send = 0; send < N_snds; ++send) {
            if (send_size[send] > 0) {
                launches += send_cctc[send]->size();
            }
        }
        const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);

        // Record the graph.
        int idx = 0;
        for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
             sit.isValid(); ++sit)
        {
            amrex::Gpu::Device::startGraphRecording( (sit() == 0),
                                                     const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
                                                     (TheFB).m_copyToBuffer.getDevicePtr(0),
                                                     std::size_t(sizeof(CopyMemory)*launches) );

            const int j = sit();
            if (send_size[j] > 0)
            {
                auto const& cctc = *send_cctc[j];
                for (auto const& tag : cctc)
                {
                    const Box& bx = tag.sbox;
                    CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
                    AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
                    {
                        // Array4's are rebuilt from cmem at replay time.
                        auto const pfab = cmem->getDst<value_type>();
                        auto const sfab = cmem->getSrc<value_type>();
                        for (int n = 0; n < cmem->ncomp; ++n)
                        {
                            pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
                        }
                    });
                }
            }

            bool last_iter = sit() == (N_snds-1);
            cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
            if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
        }
    }

    // Setup Launch Parameters
    int idx = 0;
    for (int send = 0; send < N_snds; ++send)
    {
        const int j = send;
        if (send_size[j] > 0)
        {
            char* dptr = send_data[j];
            auto const& cctc = *send_cctc[j];
            for (auto const& tag : cctc)
            {
                const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
                                                                                      amrex::makeArray4((value_type*)(dptr),
                                                                                                        tag.sbox,
                                                                                                        ncomp),
                                                                                      scomp, ncomp));

                // Advance past this tag's packed bytes.
                dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
            }
            amrex::ignore_unused(send_size);
            BL_ASSERT(dptr <= send_data[j] + send_size[j]);
        }
    }

    // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
    TheFB.m_copyToBuffer.executeGraph();
}
962
// Unpack the MPI receive buffers via a recorded CUDA graph, mirroring
// FB_pack_send_buffer_cuda_graph on the receive side.
template <class FAB>
void
FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
                                                 Vector<char*> const& recv_data,
                                                 Vector<std::size_t> const& recv_size,
                                                 Vector<CopyComTagsContainer const*> const& recv_cctc,
                                                 bool /*is_thread_safe*/)
{
    const int N_rcvs = recv_cctc.size();
    if (N_rcvs == 0) { return; }

    // Group (buffer pointer, destination box) pairs by destination fab.
    int launches = 0;
    LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
    for (int k = 0; k < N_rcvs; ++k)
    {
        if (recv_size[k] > 0)
        {
            const char* dptr = recv_data[k];
            auto const& cctc = *recv_cctc[k];
            for (auto const& tag : cctc)
            {
                recv_copy_tags[tag.dstIndex].push_back(VoidCopyTag{.p = dptr, .dbox = tag.dbox});
                dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
                launches++;
            }
            amrex::ignore_unused(recv_size);
            BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
        }
    }

    if ( !(TheFB.m_copyFromBuffer.ready()) )
    {
        const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);

        // Record the graph: one kernel per tag.
        int idx = 0;
        for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
        {
            amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
                                                     const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
                                                     (TheFB).m_copyFromBuffer.getDevicePtr(0),
                                                     std::size_t(sizeof(CopyMemory)*launches) );

            const auto& tags = recv_copy_tags[mfi];
            for (auto const & tag : tags)
            {
                CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
                AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
                {
                    // Array4's are rebuilt from cmem at replay time.
                    auto const pfab = cmem->getSrc<value_type>();
                    auto const dfab = cmem->getDst<value_type>();
                    for (int n = 0; n < cmem->ncomp; ++n)
                    {
                        dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
                    }
                });
            }

            bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
            cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
            if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
        }
    }

    // Setup graph.
    int idx = 0;
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        auto dst_array = this->array(mfi);
        const auto & tags = recv_copy_tags[mfi];
        for (auto const & tag : tags)
        {
            const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
                                                                                                      tag.dbox,
                                                                                                      ncomp),
                                                                                    dst_array,
                                                                                    dcomp, ncomp));
        }
    }

    // Launch Graph - synced because next action is freeing recv buffer.
    TheFB.m_copyFromBuffer.executeGraph();
}
1045
1046#endif /* __CUDACC__ */
1047
template <class FAB>
template <typename BUF>
auto
// NOTE(review): the extracted listing drops lines here (original 1051 and
// 1055): the function-name line (get_send_copy_tag_vector, taking
// Vector<char*> const& send_data as the first parameter) and, presumably,
// a trailing return type (a TagVector pointer, per "return tv" / "return
// nullptr" below). Verify against upstream.
                                         Vector<std::size_t> const& send_size,
                                         Vector<CopyComTagsContainer const*> const& send_cctc,
                                         int ncomp, std::uint64_t id) const
{
    using TagType = CommSendBufTag<value_type>;

    // If every send container is null there is nothing to pack.
    auto kit = std::find_if(send_cctc.begin(), send_cctc.end(),
                            [] (CopyComTagsContainer const* p) { return p != nullptr; });
    if (kit == send_cctc.end()) {
        return nullptr;
    }

    // Build one tag per (source box, buffer offset) across all sends.
    auto get_tags = [&] () -> Vector<TagType>
    {
        Vector<TagType> snd_copy_tags;
        char* pbuf = send_data[0]; // offsets are relative to the first buffer
        const int N_snds = send_data.size();
        for (int j = 0; j < N_snds; ++j)
        {
            if (send_size[j] > 0)
            {
                char* dptr = send_data[j];
                auto const& cctc = *send_cctc[j];
                for (auto const& tag : cctc)
                {
                    snd_copy_tags.emplace_back
                        (TagType{.sfab = this->const_array(tag.srcIndex), .poff = dptr-pbuf, .bx = tag.sbox});
                    dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
                }
            }
        }
        return snd_copy_tags;
    };

    // NOTE(review): a declaration line is missing here (original 1087),
    // presumably "TagVector<TagType>* tv = nullptr;". Verify upstream.
    std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};

    if (auto it = m_send_copy_handler.find(key); it != m_send_copy_handler.end()) {
        tv = it->second.get();
    } else {
        if (m_send_copy_handler.size() > 32) {
            // Just in case. If this is used in ParallelCopy, it's possible
            // that the sending FabArray is the same, but the receiving
            // FabArray is different every time. Then the size of this map
            // could increase indefinitely.
            m_send_copy_handler.clear();
        }
        auto snd_copy_tags = get_tags();
        auto utv = std::make_unique<TagVector<TagType>>(snd_copy_tags);
        tv = utv.get();
        m_send_copy_handler[key] = std::move(utv);
    }

    return tv;
}
1108
1109template <class FAB>
1110template <typename BUF>
1111void
1112FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
1113 Vector<char*> const& send_data,
1114 Vector<std::size_t> const& send_size,
1115 Vector<CopyComTagsContainer const*> const& send_cctc,
1116 std::uint64_t id)
1117{
1118 const int N_snds = send_data.size();
1119 if (N_snds == 0) { return; }
1120
1121 using TagType = CommSendBufTag<value_type>;
1122
1123 auto* tv = src.template get_send_copy_tag_vector<BUF>
1124 (send_data, send_size, send_cctc, ncomp, id);
1125 if (tv == nullptr) { return; }
1126
1127 char* pbuffer = send_data[0];
1128
1129 detail::ParallelFor_doit(*tv,
1130 [=] AMREX_GPU_DEVICE (
1131 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1132 {
1133 if (icell < ncells) {
1134 Array4<BUF> dfab{(BUF*)(pbuffer+tag.poff),
1135 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1136 for (int n = 0; n < ncomp; ++n) {
1137 dfab(i,j,k,n) = (BUF)tag.sfab(i,j,k,n+scomp);
1138 }
1139 }
1140 });
1141
1142 Gpu::streamSynchronize();
1143}
1144
1145template <class FAB>
1146template <typename BUF>
1147auto
1149 Vector<std::size_t> const& recv_size,
1150 Vector<CopyComTagsContainer const*> const& recv_cctc,
1151 int ncomp, std::uint64_t id)
1153{
1154 using TagType = CommRecvBufTag<value_type>;
1155
1156 auto kit = std::find_if(recv_cctc.begin(), recv_cctc.end(),
1157 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1158 if (kit == recv_cctc.end()) {
1159 return nullptr;
1160 }
1161
1162 auto get_tags = [&] () -> Vector<TagType>
1163 {
1164 Vector<TagType> recv_copy_tags;
1165 char* pbuf = recv_data[0];
1166 const int N_rcvs = recv_cctc.size();
1167 for (int k = 0; k < N_rcvs; ++k)
1168 {
1169 if (recv_size[k] > 0)
1170 {
1171 char* dptr = recv_data[k];
1172 auto const& cctc = *recv_cctc[k];
1173 for (auto const& tag : cctc)
1174 {
1175 const int li = this->localindex(tag.dstIndex);
1176 recv_copy_tags.emplace_back
1177 (TagType{.dfab = this->atLocalIdx(li).array(), .poff = dptr-pbuf, .bx = tag.dbox});
1178 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1179 }
1180 }
1181 }
1182 return recv_copy_tags;
1183 };
1184
1186 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1187
1188 if (auto it = m_recv_copy_handler.find(key); it != m_recv_copy_handler.end()) {
1189 tv = it->second.get();
1190 } else {
1191 if (m_recv_copy_handler.size() > 32) {
1192 // Just in case. If this is used in ParallelCopy, it's possible
1193 // that the receiving FabArray is the same, but the sending
1194 // FabArray is different every time. Then the size of this map
1195 // could increase indefinitely.
1196 m_recv_copy_handler.clear();
1197 }
1198 auto recv_copy_tags = get_tags();
1199 auto utv = std::make_unique<TagVector<TagType>>(recv_copy_tags);
1200 tv = utv.get();
1201 m_recv_copy_handler[key] = std::move(utv);
1202 }
1203
1204 return tv;
1205}
1206
1207template <class FAB>
1208template <typename BUF>
1209void
// NOTE(review): the qualified function-name line (doxygen line 1210) was dropped
// by the hyperlinked-source extraction; based on the CPU variant below it should
// read: FabArray<FAB>::unpack_recv_buffer_gpu (FabArray<FAB>& dst, int dcomp,
// int ncomp, ... — verify against upstream AMReX_FBI.H.
1211 Vector<char*> const& recv_data,
1212 Vector<std::size_t> const& recv_size,
1213 Vector<CopyComTagsContainer const*> const& recv_cctc,
1214 CpOp op, bool is_thread_safe, std::uint64_t id,
1215 bool deterministic)
1216{
1217 const int N_rcvs = recv_cctc.size();
1218 if (N_rcvs == 0) { return; }
1219
// A mask is needed when the pattern is not thread safe AND value_type lacks the
// hardware op (atomic store for COPY, atomic add for ADD) to serialize writers.
1220 bool use_mask = false;
1221 if (!is_thread_safe)
1222 {
1223 if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
1224 (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
1225 {
1226 use_mask = true;
1227 }
1228 }
1229
// Deterministic path: builds Array4CopyTags over the raw buffer and hands them
// to deterministic_fab_to_fab so additions happen in a reproducible order.
1230 if (deterministic)
1231 {
1232 AMREX_ALWAYS_ASSERT(op == FabArrayBase::ADD); // Only ADD for now
1233 using TagType = Array4CopyTag<value_type,BUF>;
1234 Vector<TagType> tags;
1235 tags.reserve(N_rcvs);
1236 for (int k = 0; k < N_rcvs; ++k) {
1237 if (recv_size[k] > 0) {
1238 char const* dptr = recv_data[k];
1239 auto const& cctc = *recv_cctc[k];
1240 for (auto const& tag : cctc) {
1241 tags.emplace_back(
1242 TagType{.dfab = dst.array(tag.dstIndex), .dindex = tag.dstIndex,
1243 .sfab = Array4<BUF const>((BUF const*)dptr,
1244 amrex::begin(tag.dbox),
1245 amrex::end(tag.dbox), ncomp),
1246 .dbox = tag.dbox,
1247 .offset = Dim3{.x = 0, .y = 0, .z = 0}});
1248 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1249 }
1250 }
1251 }
// NOTE(review): a compile-time guard line (doxygen line 1252) is missing from
// this extraction; the `} else {` below pairs with it. Judging by the Abort
// message it gates on value_type supporting operator+= — confirm against
// upstream AMReX_FBI.H.
1253 detail::deterministic_fab_to_fab<value_type,BUF>
1254 (tags, 0, dcomp, ncomp, detail::CellAdd<value_type,BUF>{});
1255 } else {
1256 amrex::Abort("SumBoundary requires operator+=");
1257 }
1258 }
1259 else if (!use_mask)
1260 {
// Fast path: reuse the cached TagVector mapping buffer sections to dest fabs.
1261 using TagType = CommRecvBufTag<value_type>;
1262 auto* tv = dst.template get_recv_copy_tag_vector<BUF>
1263 (recv_data, recv_size, recv_cctc, ncomp, id);
1264 if (tv == nullptr) { return; }
1265
1266 char* pbuffer = recv_data[0];
1267
1268 if (op == FabArrayBase::COPY)
1269 {
1270 detail::ParallelFor_doit(*tv,
1271 [=] AMREX_GPU_DEVICE (
1272 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1273 {
1274 if (icell < ncells) {
// View this tag's buffer section as an Array4 over its box.
1275 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1276 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1277 for (int n = 0; n < ncomp; ++n) {
1278 tag.dfab(i,j,k,n+dcomp) = (value_type)sfab(i,j,k,n);
1279 }
1280 }
1281 });
1282 }
1283 else
1284 {
1285 if (is_thread_safe) {
1286 detail::ParallelFor_doit(*tv,
1287 [=] AMREX_GPU_DEVICE (
1288 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1289 {
1290 if (icell < ncells) {
1291 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1292 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1293 for (int n = 0; n < ncomp; ++n) {
1294 tag.dfab(i,j,k,n+dcomp) += (value_type)sfab(i,j,k,n);
1295 }
1296 }
1297 });
1298 } else {
// Destinations may overlap across tags: rely on device atomics for the adds.
1299 detail::unpack_recv_buffer_gpu_atomic_add<BUF, value_type>
1300 (pbuffer, *tv, dcomp, ncomp);
1301 }
1302 }
1303 Gpu::streamSynchronize();
1304 }
1305 else
1306 {
// Mask path: neither plain stores nor atomics are safe for value_type, so use
// integer mask fabs (one per local fab) to give each cell exactly one writer.
1307 char* pbuffer = recv_data[0];
1308
1309 using TagType = Array4CopyTag<value_type, BUF>;
1310 Vector<TagType> recv_copy_tags;
1311 recv_copy_tags.reserve(N_rcvs);
1312
1313 Vector<BaseFab<int> > maskfabs(dst.local_size());
1314 Vector<Array4Tag<int> > masks_unique;
1315 masks_unique.reserve(dst.local_size());
1316 Vector<Array4Tag<int> > masks;
1317
1318 for (int k = 0; k < N_rcvs; ++k)
1319 {
1320 if (recv_size[k] > 0)
1321 {
1322 std::size_t offset = recv_data[k]-recv_data[0];
1323 const char* dptr = pbuffer + offset;
1324 auto const& cctc = *recv_cctc[k];
1325 for (auto const& tag : cctc)
1326 {
1327 const int li = dst.localindex(tag.dstIndex);
1328 recv_copy_tags.emplace_back(TagType{
1329 .dfab = dst.atLocalIdx(li).array(), .dindex = tag.dstIndex,
1330 .sfab = amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
1331 .dbox = tag.dbox,
1332 .offset = Dim3{.x = 0, .y = 0, .z = 0}
1333 });
1334 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1335
// Allocate one mask fab per destination fab the first time it is targeted.
1336 if (!maskfabs[li].isAllocated()) {
1337 maskfabs[li].resize(dst.atLocalIdx(li).box());
1338 masks_unique.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
1339 }
1340 masks.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
1341 }
1342 BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
1343 }
1344 }
1345
// Zero all masks before the masked copy/add kernels run.
1346 amrex::ParallelFor(masks_unique,
1347 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
1348 {
1349 msk.dfab(i,j,k) = 0;
1350 });
1351
1352 if (op == FabArrayBase::COPY)
1353 {
1354 detail::fab_to_fab_atomic_cpy<value_type, BUF>(
1355 recv_copy_tags, 0, dcomp, ncomp, masks);
1356 }
1357 else
1358 {
1359 detail::fab_to_fab_atomic_add<value_type, BUF>(
1360 recv_copy_tags, 0, dcomp, ncomp, masks);
1361 }
1362
1363 // There is Gpu::streamSynchronize in fab_to_fab.
1364 }
1367#endif /* AMREX_USE_GPU */
1368
1369template <class FAB>
1370template <typename BUF>
1371void
1372FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
1373 Vector<char*> const& send_data,
1374 Vector<std::size_t> const& send_size,
1375 Vector<CopyComTagsContainer const*> const& send_cctc)
1376{
1377 amrex::ignore_unused(send_size);
1378
1379 auto const N_snds = static_cast<int>(send_data.size());
1380 if (N_snds == 0) { return; }
1381
1382#ifdef AMREX_USE_OMP
1383#pragma omp parallel for
1384#endif
1385 for (int j = 0; j < N_snds; ++j)
1386 {
1387 if (send_size[j] > 0)
1388 {
1389 char* dptr = send_data[j];
1390 auto const& cctc = *send_cctc[j];
1391 for (auto const& tag : cctc)
1392 {
1393 const Box& bx = tag.sbox;
1394 auto const sfab = src.array(tag.srcIndex);
1395 auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
1396 amrex::LoopConcurrentOnCpu( bx, ncomp,
1397 [=] (int ii, int jj, int kk, int n) noexcept
1398 {
1399 pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
1400 });
1401 dptr += (bx.numPts() * ncomp * sizeof(BUF));
1402 }
1403 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
1404 }
1405 }
1406}
1408template <class FAB>
1409template <typename BUF>
1410void
1412 Vector<char*> const& recv_data,
1413 Vector<std::size_t> const& recv_size,
1414 Vector<CopyComTagsContainer const*> const& recv_cctc,
1415 CpOp op, bool is_thread_safe)
1416{
1417 amrex::ignore_unused(recv_size);
1418
1419 auto const N_rcvs = static_cast<int>(recv_cctc.size());
1420 if (N_rcvs == 0) { return; }
1421
1422 if (is_thread_safe)
1423 {
1424#ifdef AMREX_USE_OMP
1425#pragma omp parallel for
1426#endif
1427 for (int k = 0; k < N_rcvs; ++k)
1428 {
1429 if (recv_size[k] > 0)
1430 {
1431 const char* dptr = recv_data[k];
1432 auto const& cctc = *recv_cctc[k];
1433 for (auto const& tag : cctc)
1434 {
1435 const Box& bx = tag.dbox;
1436 FAB& dfab = dst[tag.dstIndex];
1437 if (op == FabArrayBase::COPY)
1438 {
1439 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
1440 }
1441 else
1442 {
1443 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
1444 }
1445 dptr += bx.numPts() * ncomp * sizeof(BUF);
1446 }
1447 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1448 }
1449 }
1450 }
1451 else
1452 {
1453 LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
1454 recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
1455 for (int k = 0; k < N_rcvs; ++k)
1456 {
1457 if (recv_size[k] > 0)
1458 {
1459 const char* dptr = recv_data[k];
1460 auto const& cctc = *recv_cctc[k];
1461 for (auto const& tag : cctc)
1462 {
1463 recv_copy_tags[tag.dstIndex].push_back(VoidCopyTag{.p = dptr, .dbox = tag.dbox});
1464 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1465 }
1466 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1467 }
1468 }
1469
1470#ifdef AMREX_USE_OMP
1471#pragma omp parallel
1472#endif
1473 for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1474 {
1475 const auto& tags = recv_copy_tags[mfi];
1476 auto dfab = dst.array(mfi);
1477 for (auto const & tag : tags)
1478 {
1479 auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1480 if (op == FabArrayBase::COPY)
1481 {
1482 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1483 [=] (int i, int j, int k, int n) noexcept
1484 {
1485 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1486 });
1487 }
1488 else
1489 {
1490 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1491 [=] (int i, int j, int k, int n) noexcept
1492 {
1493 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1494 });
1495 }
1496 }
1497 }
1498 }
1499}
1500
1501#endif /* AMREX_USE_MPI */
1502
1503}
1504
1505#endif
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_3D(...)
Definition AMReX_GpuLaunchMacrosC.nolint.H:106
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:356
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
CopyComTag::CopyComTagsContainer CopyComTagsContainer
Definition AMReX_FabArrayBase.H:220
int localindex(int K) const noexcept
Return local index in the vector of FABs.
Definition AMReX_FabArrayBase.H:119
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition AMReX_FabArrayBase.H:131
int local_size() const noexcept
Return the number of local FABs in the FabArray.
Definition AMReX_FabArrayBase.H:113
CpOp
parallel copy or add
Definition AMReX_FabArrayBase.H:394
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition AMReX_FabArrayBase.H:95
An Array of FortranArrayBox(FAB)-like Objects.
Definition AMReX_FabArray.H:351
typename std::conditional_t< IsBaseFab< FAB >::value, FAB, FABType >::value_type value_type
Definition AMReX_FabArray.H:362
void CMD_remote_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:683
void FB_local_add_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:442
void FB_local_add_gpu(const FB &TheFB, int scomp, int ncomp, bool deterministic)
Definition AMReX_FBI.H:605
Array4< typename FabArray< FAB >::value_type const > array(const MFIter &mfi) const noexcept
Definition AMReX_FabArray.H:569
void FB_local_copy_gpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:533
void CMD_local_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:653
void FB_local_copy_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:383
TagVector< Array4CopyTag< value_type > > const * FB_get_local_copy_tag_vector(const FB &TheFB)
Definition AMReX_FBI.H:490
FAB & atLocalIdx(int L) noexcept
Return a reference to the FAB associated with local index L.
Definition AMReX_FabArray.H:539
a one-thingy-per-box distributed object
Definition AMReX_LayoutData.H:13
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:88
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:172
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
__host__ __device__ AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition AMReX_GpuAtomic.H:487
__host__ __device__ AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition AMReX_GpuAtomic.H:513
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:200
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ Array4< T > makeArray4(T *p, Box const &bx, int ncomp) noexcept
Definition AMReX_BaseFab.H:92
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2867
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
const int[]
Definition AMReX_BLProfiler.cpp:1664
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015
BoxArray const & boxArray(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2862
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334
Definition AMReX_TagParallelFor.H:58
Definition AMReX_TagParallelFor.H:26
Definition AMReX_TagParallelFor.H:50
Array4< T > dfab
Definition AMReX_TagParallelFor.H:51
A multidimensional array accessor.
Definition AMReX_Array4.H:283
Definition AMReX_TagParallelFor.H:106
Definition AMReX_TagParallelFor.H:116
Definition AMReX_Dim3.H:12
int x
Definition AMReX_Dim3.H:12
Definition AMReX_FabArrayBase.H:472
bool m_threadsafe_loc
Definition AMReX_FabArrayBase.H:474
bool m_threadsafe_rcv
Definition AMReX_FabArrayBase.H:475
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition AMReX_FabArrayBase.H:478
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition AMReX_FabArrayBase.H:476
Used by a bunch of routines when communicating via MPI.
Definition AMReX_FabArrayBase.H:195
Box sbox
Definition AMReX_FabArrayBase.H:197
int srcIndex
Definition AMReX_FabArrayBase.H:199
Box dbox
Definition AMReX_FabArrayBase.H:196
int dstIndex
Definition AMReX_FabArrayBase.H:198
FillBoundary.
Definition AMReX_FabArrayBase.H:488
Definition AMReX_FBI.H:7
IntVect offset
Definition AMReX_FBI.H:10
Box dbox
Definition AMReX_FBI.H:9
FAB const * sfab
Definition AMReX_FBI.H:8
Definition AMReX_TypeTraits.H:56
Definition AMReX_TypeTraits.H:66
Definition AMReX_TypeTraits.H:282
Definition AMReX_TagParallelFor.H:158
Definition AMReX_FBI.H:13
Box dbox
Definition AMReX_FBI.H:15
char const * p
Definition AMReX_FBI.H:14