docs_html/doxygen/AMReX__GpuContainers_8H_source.html

#ifndef AMREX_GPU_CONTAINERS_H_

#define AMREX_GPU_CONTAINERS_H_

#include <AMReX_Config.H>


#include <AMReX_Vector.H>

#include <AMReX_PODVector.H>

#include <AMReX_GpuAllocators.H>

#include <type_traits>


#include <numeric>

#include <iterator>


namespace amrex::Gpu {


#ifdef AMREX_USE_GPU

    // GPU builds: every alias is a PODVector paired with one specific
    // arena allocator type.

    //! PODVector that allocates through ArenaAllocator.
    template <typename T>
    using DeviceVector = PODVector<T, ArenaAllocator<T>>;

    //! PODVector that allocates through DeviceArenaAllocator.
    template <typename T>
    using NonManagedDeviceVector = PODVector<T, DeviceArenaAllocator<T>>;

    //! PODVector that allocates through ManagedArenaAllocator.
    template <typename T>
    using ManagedVector = PODVector<T, ManagedArenaAllocator<T>>;

    //! PODVector that allocates through PinnedArenaAllocator.
    template <typename T>
    using PinnedVector = PODVector<T, PinnedArenaAllocator<T>>;

    //! PODVector that allocates through AsyncArenaAllocator.
    template <typename T>
    using AsyncVector = PODVector<T, AsyncArenaAllocator<T>>;

    //! Same type as PinnedVector<T>.
    template <typename T>
    using HostVector = PinnedVector<T>;

    //! PODVector that allocates through ManagedArenaAllocator
    //! (the same underlying type as ManagedVector<T>).
    template <typename T>
    using ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T>>;

#else

    // Builds without AMREX_USE_GPU: all aliases name PODVector<T> with its
    // default allocator, so code written against them compiles unchanged.

    template <typename T>
    using DeviceVector = PODVector<T>;

    template <typename T>
    using NonManagedDeviceVector = PODVector<T>;

    template <typename T>
    using ManagedVector = PODVector<T>;

    template <typename T>
    using PinnedVector = PODVector<T>;

    template <typename T>
    using AsyncVector = PODVector<T>;

    template <typename T>
    using HostVector = PODVector<T>;

    template <typename T>
    using ManagedDeviceVector = PODVector<T>;

#endif


    // Empty tag types plus one constant of each. The tag passed as the first
    // argument of copy()/copyAsync() selects which overload is called.

    //! Tag selecting the overloads that forward to htod_memcpy / htod_memcpy_async.
    struct HostToDevice {};
    static constexpr HostToDevice hostToDevice = HostToDevice{};

    //! Tag selecting the overloads that forward to dtoh_memcpy / dtoh_memcpy_async.
    struct DeviceToHost {};
    static constexpr DeviceToHost deviceToHost = DeviceToHost{};

    //! Tag selecting the overloads that forward to dtod_memcpy / dtod_memcpy_async.
    struct DeviceToDevice {};
    static constexpr DeviceToDevice deviceToDevice = DeviceToDevice{};


    /**
     * \brief A host-to-device copy routine.
     *
     * Thin wrapper around htod_memcpy: only the address of the first source
     * element, the address of the first destination element and a byte count
     * are forwarded, so both ranges are assumed to be contiguous in memory.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (host) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (device) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            htod_memcpy(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief A device-to-host copy routine.
     *
     * Thin wrapper around dtoh_memcpy: only the address of the first source
     * element, the address of the first destination element and a byte count
     * are forwarded, so both ranges are assumed to be contiguous in memory.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (device) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (host) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            dtoh_memcpy(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief A device-to-device copy routine.
     *
     * Thin wrapper around dtod_memcpy: only the address of the first source
     * element, the address of the first destination element and a byte count
     * are forwarded, so both ranges are assumed to be contiguous in memory.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (device) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (device) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            dtod_memcpy(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief A host-to-device copy routine that forwards to htod_memcpy_async.
     *
     * Only the address of the first source element, the address of the first
     * destination element and a byte count are passed on, so both ranges are
     * assumed to be contiguous in memory. No synchronization call is made
     * here; completion semantics are those of htod_memcpy_async.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (host) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (device) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            htod_memcpy_async(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief A device-to-host copy routine that forwards to dtoh_memcpy_async.
     *
     * Only the address of the first source element, the address of the first
     * destination element and a byte count are passed on, so both ranges are
     * assumed to be contiguous in memory. No synchronization call is made
     * here; completion semantics are those of dtoh_memcpy_async.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (device) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (host) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            dtoh_memcpy_async(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief A device-to-device copy routine that forwards to dtod_memcpy_async.
     *
     * Only the address of the first source element, the address of the first
     * destination element and a byte count are passed on, so both ranges are
     * assumed to be contiguous in memory. No synchronization call is made
     * here; completion semantics are those of dtod_memcpy_async.
     *
     * \tparam InIter  iterator type of the source range
     * \tparam OutIter iterator type of the destination; its value_type must
     *                 be identical to that of InIter (checked at compile time)
     *
     * \param begin  first element of the source (device) range
     * \param end    one past the last element of the source range
     * \param result first element of the destination (device) range
     *
     * Nothing is done when the source range is empty.
     */
    template <typename InIter, typename OutIter>
    void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using SrcT = typename std::iterator_traits<InIter>::value_type;
        using DstT = typename std::iterator_traits<OutIter>::value_type;

        static_assert(std::is_same_v<SrcT, DstT>);
        static_assert(std::is_trivially_copyable_v<SrcT>,
                      "Can only copy trivially copyable types");

        const auto count = std::distance(begin, end);
        if (count != 0) {
            // Destination first, then source, then the size in bytes.
            dtod_memcpy_async(&(*result), &(*begin), count*sizeof(SrcT));
        }
    }


    /**
     * \brief Migrate elements of a container from device to host.
     *
     * The prefetch itself is only compiled for CUDA builds on non-Windows
     * platforms, and is only issued when Gpu::Device::devicePropMajor() is at
     * least 6. In every other configuration the only action for a non-empty
     * range is the trailing Gpu::streamSynchronize() call.
     *
     * \param begin first element of the range to prefetch
     * \param end   one past the last element of the range
     *
     * Returns immediately, without synchronizing, when the range is empty.
     */
    template <typename Iter>
    void prefetchToHost (Iter begin, Iter end) noexcept
    {
        using elem_t = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable_v<elem_t>,
                      "Can only copy trivially copyable types");

        const auto nelems = std::distance(begin, end);
        if (nelems == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
            auto const first_elem = &(*begin);
            auto const nbytes = nelems*sizeof(elem_t);
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 13000)
            // CUDA runtime 13+ takes the destination as a cudaMemLocation
            // plus a flags argument.
            cudaMemLocation host_location = {};
            host_location.type = cudaMemLocationTypeHost;
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(first_elem, nbytes,
                                                      host_location, 0,
                                                      Gpu::gpuStream()));
#else
            // Older runtimes identify the host by cudaCpuDeviceId.
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(first_elem, nbytes,
                                                      cudaCpuDeviceId,
                                                      Gpu::gpuStream()));
#endif
        }
#endif
#endif

        Gpu::streamSynchronize();
    }


    /**
     * \brief Migrate elements of a container from host to device.
     *
     * The prefetch itself is only compiled for CUDA builds on non-Windows
     * platforms, and is only issued when Gpu::Device::devicePropMajor() is at
     * least 6; the target is the device reported by Gpu::Device::deviceId().
     * In every other configuration the only action for a non-empty range is
     * the trailing Gpu::streamSynchronize() call.
     *
     * \param begin first element of the range to prefetch
     * \param end   one past the last element of the range
     *
     * Returns immediately, without synchronizing, when the range is empty.
     */
    template <typename Iter>
    void prefetchToDevice (Iter begin, Iter end) noexcept
    {
        using elem_t = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable_v<elem_t>,
                      "Can only copy trivially copyable types");

        const auto nelems = std::distance(begin, end);
        if (nelems == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
            auto const first_elem = &(*begin);
            auto const nbytes = nelems*sizeof(elem_t);
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 13000)
            // CUDA runtime 13+ takes the destination as a cudaMemLocation
            // plus a flags argument.
            cudaMemLocation device_location = {};
            device_location.type = cudaMemLocationTypeDevice;
            device_location.id = Gpu::Device::deviceId();
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(first_elem, nbytes,
                                                      device_location, 0,
                                                      Gpu::gpuStream()));
#else
            // Older runtimes take the destination device id directly.
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(first_elem, nbytes,
                                                      Gpu::Device::deviceId(),
                                                      Gpu::gpuStream()));
#endif
        }
#endif
#endif

        Gpu::streamSynchronize();
    }


    /**
     * \brief Fill the elements in the given range using the given callable.
     *
     * For every index i in [0, N), where N = std::distance(first, last), the
     * callable is invoked as f(elem, i) and the resulting elem is stored at
     * position i of the range. The range is accessed through the address of
     * *first, so it must be contiguous.
     *
     * The overload only participates when the value type T is trivially
     * copyable, sizeof(T) <= 36*8, and F is callable with (T&, Long).
     *
     * Without AMREX_USE_GPU the elements are filled in a plain sequential
     * loop. With AMREX_USE_GPU, types of at most 8 bytes are filled directly
     * via amrex::ParallelFor; larger types are first built in a per-block
     * shared-memory buffer and then written out to the destination in units
     * of unsigned int / unsigned long long words.
     *
     * This function itself does not call any stream synchronization.
     *
     * \param first first element of the range to fill
     * \param last  one past the last element of the range
     * \param f     callable invoked as f(T&, Long) for each element
     */
    template <typename IT, typename F,

              typename T = typename std::iterator_traits<IT>::value_type,

              std::enable_if_t<(sizeof(T) <= 36*8) &&  // so there is enough shared memory

                               std::is_trivially_copyable_v<T> &&

                               amrex::IsCallable<F, T&, Long>::value,

                               int> FOO = 0>


    void fillAsync (IT first, IT last, F const& f) noexcept

    {

        auto N = static_cast<Long>(std::distance(first, last));

        // Empty (or reversed) range: nothing to fill.
        if (N <= 0) { return; }

        // Raw pointer to the first element; all later accesses index off it.
        auto p = &(*first);

#ifndef AMREX_USE_GPU

        // Host-only build: fill the elements one after another.
        for (Long i = 0; i < N; ++i) {

            f(p[i], i);

        }

#else

        // No need to use shared memory if the type is small.

        // May not have enough shared memory if the type is too big.

        // Cannot use shared memory, if the type is not trivially copyable.
        // Note: the enable_if constraint on this template already rejects
        // sizeof(T) > 36*8 and non-trivially-copyable T, so for this overload
        // only the sizeof(T) <= 8 test can actually be true.

        if constexpr ((sizeof(T) <= 8)

                      || (sizeof(T) > 36*8)

                      || ! std::is_trivially_copyable<T>()) {

            // Direct path: each index writes its own element in place.
            amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept

            {

                f(p[i], i);

            });

        } else {

            // The word-wise copy below requires T to be a whole number of
            // unsigned int words.
            static_assert(sizeof(T) % sizeof(unsigned int) == 0);

            // Word type used for the copy-out: unsigned long long when
            // sizeof(T) is a multiple of its size, unsigned int otherwise.
            using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,

                                         unsigned long long, unsigned int>;

            // Number of U words that make up one T.
            constexpr Long nU = sizeof(T) / sizeof(U);

            // Destination range viewed as an array of U words.
            auto pu = reinterpret_cast<U*>(p);

            // Fewer threads per block for bigger types, which keeps the
            // per-block shared buffer (nthreads_per_block * sizeof(T)) smaller.
            constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;

            // One thread per element, rounded up to whole blocks.
            int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);

            // Shared buffer holds one T per thread of the block.
            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);

#ifdef AMREX_USE_SYCL

            // SYCL variant: indices and the shared buffer come from the
            // Gpu::Handler passed to the kernel.
            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),

            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept

            {

                Long i = handler.globalIdx();

                Long blockDimx = handler.blockDim();

                Long threadIdxx = handler.threadIdx();

                Long blockIdxx = handler.blockIdx();

                // The same shared buffer, viewed as U words and as T elements.
                auto const shared_U = (U*)handler.sharedMemory();

                auto const shared_T = (T*)shared_U;

                if (i < N) {

                    // Create this thread's element in its slot of the shared
                    // buffer and let the callable fill it.
                    auto ga = new(shared_T+threadIdxx) T;

                    f(*ga, i);

                }

                // All elements of the block must be written to the shared
                // buffer before any thread starts copying them out.
                handler.sharedBarrier();

                // Copy the block's valid elements to the destination word by
                // word. mend is the number of U words covering the elements
                // of this block that lie inside [0, N); consecutive threads
                // handle consecutive words.
                for (Long m = threadIdxx,

                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);

                     m < mend; m += blockDimx) {

                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];

                }

            });

#else

            // Non-SYCL variant: indices come from the built-in blockDim /
            // threadIdx / blockIdx variables, the shared buffer from
            // Gpu::SharedMemory.
            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),

                          [=] AMREX_GPU_DEVICE () noexcept

            {

                Long blockDimx = blockDim.x;

                Long threadIdxx = threadIdx.x;

                Long blockIdxx = blockIdx.x;

                // Global element index handled by this thread.
                Long i = blockDimx*blockIdxx + threadIdxx;

                Gpu::SharedMemory<U> gsm;

                // The same shared buffer, viewed as U words and as T elements.
                auto const shared_U = gsm.dataPtr();

                auto const shared_T = (T*)shared_U;

                if (i < N) {

                    // Create this thread's element in its slot of the shared
                    // buffer and let the callable fill it.
                    auto ga = new(shared_T+threadIdxx) T;

                    f(*ga, i);

                }

                // All elements of the block must be written to the shared
                // buffer before any thread starts copying them out.
                __syncthreads();

                // Copy the block's valid elements to the destination word by
                // word. mend is the number of U words covering the elements
                // of this block that lie inside [0, N); consecutive threads
                // handle consecutive words.
                for (Long m = threadIdxx,

                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);

                     m < mend; m += blockDimx) {

                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];

                }

            });

#endif

        }

#endif

    }


}


#endif

AMReX_GpuAllocators.H

AMREX_CUDA_SAFE_CALL
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18

AMReX_PODVector.H

AMReX_Vector.H

amrex::Gpu::Device::deviceId
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:692

amrex::Gpu::Device::devicePropMajor
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:203

amrex::PODVector
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308

amrex::Long
amrex_long Long
Definition AMReX_INT.H:30

amrex::Gpu
Definition AMReX_BaseFwd.H:55

amrex::Gpu::dtod_memcpy_async
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:449

amrex::Gpu::fillAsync
void fillAsync(IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
Definition AMReX_GpuContainers.H:422

amrex::Gpu::copy
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:128

amrex::Gpu::copyAsync
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:228

amrex::Gpu::prefetchToHost
void prefetchToHost(Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:322

amrex::Gpu::deviceToDevice
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107

amrex::Gpu::deviceToHost
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106

amrex::Gpu::hostToDevice
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105

amrex::Gpu::streamSynchronize
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310

amrex::Gpu::dtoh_memcpy_async
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435

amrex::Gpu::dtoh_memcpy
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:496

amrex::Gpu::htod_memcpy
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:488

amrex::Gpu::htod_memcpy_async
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421

amrex::Gpu::dtod_memcpy
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:504

amrex::Gpu::gpuStream
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291

amrex::Gpu::prefetchToDevice
void prefetchToDevice(Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:367

amrex::Order::F
@ F

amrex::ParallelFor
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193

amrex::min
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24

amrex::begin
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006

amrex::end
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015

amrex::Gpu::DeviceToDevice
Definition AMReX_GpuContainers.H:104

amrex::Gpu::DeviceToHost
Definition AMReX_GpuContainers.H:103

amrex::Gpu::Handler
Definition AMReX_GpuTypes.H:86

amrex::Gpu::HostToDevice
Definition AMReX_GpuContainers.H:102

amrex::Gpu::SharedMemory
Definition AMReX_GpuMemory.H:125

amrex::Gpu::SharedMemory::dataPtr
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:126

amrex::IsCallable
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213