Actual source code: cupminterface.hpp
1: #ifndef PETSCCUPMINTERFACE_HPP
2: #define PETSCCUPMINTERFACE_HPP
4: #if defined(__cplusplus)
5: #include <petsc/private/cpputil.hpp>
6: #include <petsc/private/petscadvancedmacros.h>
7: #include <petscdevice_cupm.h>
9: #include <array>
11: namespace Petsc
12: {
14: namespace device
15: {
17: namespace cupm
18: {
20: // enum describing available cupm devices, this is used as the template parameter to any
21: // class subclassing the Interface or using it as a member variable
22: enum class DeviceType : int {
23: CUDA,
24: HIP
25: };
27: // clang-format off
28: static constexpr std::array<const char *const, 5> DeviceTypes = {
29: "cuda",
30: "hip",
31: "Petsc::Device::CUPM::DeviceType",
32: "Petsc::Device::CUPM::DeviceType::",
33: nullptr
34: };
35: // clang-format on
37: namespace impl
38: {
40: // A backend agnostic PetscCallCUPM() function, this will only work inside the member
41: // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
42: // functions can also be wrapped inline:
43: //
44: // PetscCallCUPM(foo<int,char,bool>());
45: #define PetscCallCUPM(...) \
46: do { \
47: const cupmError_t cerr_p_ = __VA_ARGS__; \
48: PetscCheck(cerr_p_ == cupmSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
49: } while (0)
51: #define PetscCallCUPMAbort(comm_, ...) \
52: do { \
53: const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
54: PetscCheckAbort(cerr_abort_p_ == cupmSuccess, comm_, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_abort_p_), cupmGetErrorName(cerr_abort_p_), cupmGetErrorString(cerr_abort_p_)); \
55: } while (0)
57: // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
58: //
59: // input params:
60: // our_name - the name of the alias
61: // their_name - the name of the function being aliased
62: //
63: // notes:
64: // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
65: //
66: // example usage:
67: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
68: // template <typename... T>
69: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
70: // {
71: // return cudaMalloc(std::forward<T>(args)...);
72: // }
73: //
74: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
75: // template <typename... T>
76: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
77: // {
78: // return hipMalloc(std::forward<T>(args)...);
79: // }
80: #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)
82: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
83: // discard the last N arguments
84: //
85: // input params:
86: // our_name - the name of the alias
87: // their_name - the name of the function being aliased
88: // N - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
89: //
90: // notes:
91: // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
92: //
93: // example use:
94: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
95: // template <typename... T, typename Tend>
96: // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
97: // return type deduction*
98: // {
99: // (void)argend;
100: // return cudaMalloc(std::forward<T>(args)...);
101: // }
102: #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)
104: // Base class that holds functions and variables that don't require CUDA or HIP to be present
105: // on the system
106: template <DeviceType T>
107: struct InterfaceBase {
108: static const DeviceType type = T;
110: PETSC_NODISCARD static constexpr const char *cupmName() noexcept
111: {
112: static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
113: static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
114: return std::get<util::to_underlying(T)>(DeviceTypes);
115: }
117: PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; }
119: PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }
121: PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
122: };
124: // declare the base class static member variables
125: template <DeviceType T>
126: const DeviceType InterfaceBase<T>::type;
128: #define PETSC_CUPM_BASE_CLASS_HEADER(T) \
129: using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \
130: using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \
131: using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \
132: using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \
133: using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM
135: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
136: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
137: // import them into classes.
138: template <DeviceType>
139: struct InterfaceImpl;
141: #if PetscDefined(HAVE_CUDA)
142: template <>
143: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
144: PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA);
146: // typedefs
147: using cupmError_t = cudaError_t;
148: using cupmEvent_t = cudaEvent_t;
149: using cupmStream_t = cudaStream_t;
150: using cupmDeviceProp_t = cudaDeviceProp;
151: using cupmMemcpyKind_t = cudaMemcpyKind;
152: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
153: using cupmPointerAttributes_t = cudaPointerAttributes;
154: using cupmMemoryType_t = enum cudaMemoryType;
155: using cupmDim3 = dim3;
156: using cupmHostFn_t = cudaHostFn_t;
157: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
158: using cupmMemPool_t = cudaMemPool_t;
159: using cupmMemPoolAttr = cudaMemPoolAttr;
160: #else
161: using cupmMemPool_t = void *;
162: using cupmMemPoolAttr = unsigned int;
163: #endif
165: // values
166: static const auto cupmSuccess = cudaSuccess;
167: static const auto cupmErrorNotReady = cudaErrorNotReady;
168: static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
169: static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
170: static const auto cupmErrorStubLibrary =
171: #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
172: cudaErrorStubLibrary;
173: #else
174: cudaErrorInsufficientDriver;
175: #endif
177: static const auto cupmErrorNoDevice = cudaErrorNoDevice;
178: static const auto cupmStreamDefault = cudaStreamDefault;
179: static const auto cupmStreamNonBlocking = cudaStreamNonBlocking;
180: static const auto cupmDeviceMapHost = cudaDeviceMapHost;
181: static const auto cupmMemcpyHostToDevice = cudaMemcpyHostToDevice;
182: static const auto cupmMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
183: static const auto cupmMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
184: static const auto cupmMemcpyHostToHost = cudaMemcpyHostToHost;
185: static const auto cupmMemcpyDefault = cudaMemcpyDefault;
186: static const auto cupmMemoryTypeHost = cudaMemoryTypeHost;
187: static const auto cupmMemoryTypeDevice = cudaMemoryTypeDevice;
188: static const auto cupmMemoryTypeManaged = cudaMemoryTypeManaged;
189: static const auto cupmEventDisableTiming = cudaEventDisableTiming;
190: static const auto cupmHostAllocDefault = cudaHostAllocDefault;
191: static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
192: static const auto cupmMemPoolAttrReleaseThreshold =
193: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
194: cudaMemPoolAttrReleaseThreshold;
195: #else
196: cupmMemPoolAttr{0};
197: #endif
199: // error functions
200: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
201: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
202: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)
204: // device management
205: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
206: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
207: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
208: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
209: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
210: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
211: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
212: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
213: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
214: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
215: #else
216: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
217: {
218: *pool = nullptr;
219: return cupmSuccess;
220: }
222: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
223: #endif
224: // CUDA has no cudaInit() to match hipInit()
225: PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }
227: // stream management
228: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
229: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
230: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
231: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
232: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
233: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
234: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
235: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
236: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
237: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
238: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
239: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
240: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
241: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
242: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
243: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)
245: // memory management
246: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
247: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
248: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
249: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
250: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
251: #else
252: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
253: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
254: #endif
255: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
256: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
257: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
258: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
259: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
260: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
261: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
262: #else
263: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
264: #endif
265: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D)
266: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1)
267: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D)
268: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1)
270: // launch control
271: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
272: template <typename FunctionT, typename... KernelArgsT>
273: PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
274: {
275: static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");
277: void *args[] = {(void *)&kernelArgs...};
278: return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
279: }
280: };
281: #endif // PetscDefined(HAVE_CUDA)
283: #if PetscDefined(HAVE_HIP)
284: template <>
285: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
286: PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP);
288: // typedefs
289: using cupmError_t = hipError_t;
290: using cupmEvent_t = hipEvent_t;
291: using cupmStream_t = hipStream_t;
292: using cupmDeviceProp_t = hipDeviceProp_t;
293: using cupmMemcpyKind_t = hipMemcpyKind;
294: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
295: using cupmPointerAttributes_t = hipPointerAttribute_t;
296: using cupmMemoryType_t = enum hipMemoryType;
297: using cupmDim3 = dim3;
298: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
299: using cupmHostFn_t = hipHostFn_t;
300: using cupmMemPool_t = hipMemPool_t;
301: using cupmMemPoolAttr = hipMemPoolAttr;
302: #else
303: using cupmHostFn_t = void (*)(void *);
304: using cupmMemPool_t = void *;
305: using cupmMemPoolAttr = unsigned int;
306: #endif
308: // values
309: static const auto cupmSuccess = hipSuccess;
310: static const auto cupmErrorNotReady = hipErrorNotReady;
311: // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
312: static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
313: static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
314: // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
315: static const auto cupmErrorStubLibrary = hipErrorInsufficientDriver;
316: static const auto cupmErrorNoDevice = hipErrorNoDevice;
317: static const auto cupmStreamDefault = hipStreamDefault;
318: static const auto cupmStreamNonBlocking = hipStreamNonBlocking;
319: static const auto cupmDeviceMapHost = hipDeviceMapHost;
320: static const auto cupmMemcpyHostToDevice = hipMemcpyHostToDevice;
321: static const auto cupmMemcpyDeviceToHost = hipMemcpyDeviceToHost;
322: static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
323: static const auto cupmMemcpyHostToHost = hipMemcpyHostToHost;
324: static const auto cupmMemcpyDefault = hipMemcpyDefault;
325: static const auto cupmMemoryTypeHost = hipMemoryTypeHost;
326: static const auto cupmMemoryTypeDevice = hipMemoryTypeDevice;
327: // see
328: // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
329: static const auto cupmMemoryTypeManaged = hipMemoryTypeUnified;
330: static const auto cupmEventDisableTiming = hipEventDisableTiming;
331: static const auto cupmHostAllocDefault = hipHostMallocDefault;
332: static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
333: static const auto cupmMemPoolAttrReleaseThreshold =
334: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
335: hipMemPoolAttrReleaseThreshold;
336: #else
337: cupmMemPoolAttr{0};
338: #endif
340: // error functions
341: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
342: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
343: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)
345: // device management
346: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
347: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
348: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
349: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
350: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
351: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
352: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
353: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
354: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
355: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
356: #else
357: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
358: {
359: *pool = nullptr;
360: return cupmSuccess;
361: }
363: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
364: #endif
365: PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)
367: // stream management
368: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
369: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
370: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
371: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
372: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
373: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
374: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
375: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
376: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
377: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
378: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
379: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
380: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
381: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
382: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
383: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)
385: // memory management
386: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
387: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
388: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
389: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
390: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
391: #else
392: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
393: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
394: #endif
395: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
396: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
397: // hipMallocHost is deprecated
398: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
399: // hipFreeHost is deprecated
400: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
401: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
402: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
403: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D)
404: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1)
405: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D)
406: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1)
408: // launch control
409: // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
410: // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
411: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
412: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
413: #else
414: PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
415: {
416: // the only correct way to spoof this function is to do it synchronously...
417: auto herr = hipStreamSynchronize(stream);
418: if (PetscUnlikely(herr != hipSuccess)) return herr;
419: fn(ctx);
420: return herr;
421: }
422: #endif
424: template <typename FunctionT, typename... KernelArgsT>
425: PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
426: {
427: void *args[] = {(void *)&kernelArgs...};
428: return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
429: }
430: };
431: #endif // PetscDefined(HAVE_HIP)
433: // shorthand for bringing all of the typedefs from the base Interface class into your own,
434: // it's annoying that c++ doesn't have a way to do this automatically
435: #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \
436: PETSC_CUPM_BASE_CLASS_HEADER(T); \
437: /* types */ \
438: using cupmComplex_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \
439: using cupmError_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \
440: using cupmEvent_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \
441: using cupmStream_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \
442: using cupmDeviceProp_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \
443: using cupmMemcpyKind_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \
444: using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \
445: using cupmMemoryType_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \
446: using cupmDim3 = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \
447: using cupmMemPool_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \
448: using cupmMemPoolAttr = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \
449: /* variables */ \
450: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \
451: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \
452: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \
453: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \
454: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \
455: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \
456: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \
457: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \
458: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \
459: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \
460: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \
461: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \
462: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \
463: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \
464: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \
465: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \
466: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \
467: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \
468: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \
469: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \
470: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \
471: /* functions */ \
472: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \
473: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \
474: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \
475: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \
476: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \
477: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \
478: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \
479: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \
480: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \
481: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \
482: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \
483: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \
484: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \
485: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \
486: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \
487: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \
488: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \
489: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \
490: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \
491: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \
492: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \
493: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \
494: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \
495: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \
496: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \
497: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \
498: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \
499: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \
500: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \
501: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \
502: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \
503: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \
504: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \
505: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \
506: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \
507: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \
508: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \
509: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \
510: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \
511: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \
512: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc
514: // The actual interface class
515: template <DeviceType T>
516: struct Interface : InterfaceImpl<T> {
517: private:
518: using interface_type = InterfaceImpl<T>;
520: public:
521: PETSC_CUPM_IMPL_CLASS_HEADER(T);
523: using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
524: using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
526: PETSC_NODISCARD static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
527: {
528: #if PetscDefined(USE_COMPLEX)
529: return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
530: #else
531: return static_cast<cupmScalar_t>(s);
532: #endif
533: }
535: PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }
537: PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }
539: PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }
541: PETSC_NODISCARD static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }
543: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
544: #define PETSC_PKG_CUDA_VERSION_GE(...) 0
545: #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
546: #endif
547: static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
548: {
549: cupmPointerAttributes_t attr;
550: cupmError_t cerr;
552: PetscFunctionBegin;
554: if (registered) {
556: *registered = PETSC_FALSE;
557: }
558: if (managed) {
560: *managed = PETSC_FALSE;
561: }
562: // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
563: // a host pointer returns cudaErrorInvalidValue
564: cerr = cupmPointerGetAttributes(&attr, data);
565: cerr = cupmGetLastError();
566: // HIP seems to always have used memoryType though
567: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
568: const auto mtype = attr.memoryType;
569: if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
570: #else
571: if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
572: const auto mtype = attr.type;
573: if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
574: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
575: if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
576: if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
577: PetscFunctionReturn(PETSC_SUCCESS);
578: }
579: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
580: #undef PETSC_PKG_CUDA_VERSION_GE
581: #endif
583: PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
584: {
585: switch (mode) {
586: case PETSC_DEVICE_COPY_HTOH:
587: return cupmMemcpyHostToHost;
588: case PETSC_DEVICE_COPY_HTOD:
589: return cupmMemcpyHostToDevice;
590: case PETSC_DEVICE_COPY_DTOD:
591: return cupmMemcpyDeviceToDevice;
592: case PETSC_DEVICE_COPY_DTOH:
593: return cupmMemcpyDeviceToHost;
594: case PETSC_DEVICE_COPY_AUTO:
595: return cupmMemcpyDefault;
596: }
597: PetscUnreachable();
598: return cupmMemcpyDefault;
599: }
601: // these change what the arguments mean, so need to namespace these
602: template <typename M>
603: static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
604: {
605: static_assert(!std::is_void<M>::value, "");
607: PetscFunctionBegin;
609: *ptr = nullptr;
610: if (n) {
611: const auto bytes = n * sizeof(M);
612: // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
613: //
614: // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
615: // underpin its cuda-aware MPI implementation, so we cannot just async allocate
616: // blindly...
617: if (stream) {
618: PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
619: } else {
620: PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
621: }
622: }
623: PetscFunctionReturn(PETSC_SUCCESS);
624: }
626: template <typename M>
627: static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
628: {
629: PetscFunctionBegin;
630: PetscCall(PetscCUPMMallocAsync(ptr, n));
631: PetscFunctionReturn(PETSC_SUCCESS);
632: }
634: template <typename M>
635: static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
636: {
637: static_assert(!std::is_void<M>::value, "");
639: PetscFunctionBegin;
641: *ptr = nullptr;
642: if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
643: PetscFunctionReturn(PETSC_SUCCESS);
644: }
646: template <typename D>
647: static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
648: {
649: static_assert(!std::is_void<D>::value, "");
650: const auto size = n * sizeof(D);
652: PetscFunctionBegin;
653: if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
655: PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
656: PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
657: // do early return after nullptr check since we need to check that they are not both nullptrs
658: if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
659: if (kind == cupmMemcpyHostToHost) {
660: // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
661: // than it is to just call the vendor functions. This assumes of course that the stream
662: // accounts for both memory regions being "idle"
663: if (cupmStreamQuery(stream) == cupmSuccess) {
664: PetscCall(PetscMemcpy(dest, src, size));
665: PetscFunctionReturn(PETSC_SUCCESS);
666: }
667: // need to clear the potential cupmErrorNotReady generated by query above...
668: auto cerr = cupmGetLastError();
670: if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
671: }
672: if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
673: PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
674: } else {
675: PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
676: }
677: PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
678: PetscFunctionReturn(PETSC_SUCCESS);
679: }
681: template <typename D>
682: static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
683: {
684: PetscFunctionBegin;
685: PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
686: PetscFunctionReturn(PETSC_SUCCESS);
687: }
689: template <typename D>
690: static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr)
691: {
692: static_assert(!std::is_void<D>::value, "");
693: const auto dest_pitch_bytes = dest_pitch * sizeof(D);
694: const auto src_pitch_bytes = src_pitch * sizeof(D);
695: const auto width_bytes = width * sizeof(D);
696: const auto size = height * width_bytes;
698: PetscFunctionBegin;
699: if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
700: PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
701: PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
702: if (stream || (kind != cupmMemcpyDeviceToHost)) {
703: PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream));
704: } else {
705: PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind));
706: }
707: PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
708: PetscFunctionReturn(PETSC_SUCCESS);
709: }
711: template <typename D>
712: static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind)
713: {
714: PetscFunctionBegin;
715: PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind));
716: PetscFunctionReturn(PETSC_SUCCESS);
717: }
719: template <typename M>
720: static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
721: {
722: static_assert(!std::is_void<M>::value, "");
724: PetscFunctionBegin;
725: if (PetscLikely(n)) {
726: const auto bytes = n * sizeof(M);
728: PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
729: if (stream || use_async) {
730: PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
731: } else {
732: PetscCallCUPM(cupmMemset(ptr, value, bytes));
733: }
734: }
735: PetscFunctionReturn(PETSC_SUCCESS);
736: }
738: template <typename M>
739: static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
740: {
741: PetscFunctionBegin;
742: PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
743: PetscFunctionReturn(PETSC_SUCCESS);
744: }
746: template <typename D>
747: static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr)
748: {
749: static_assert(!std::is_void<D>::value, "");
750: const auto pitch_bytes = pitch * sizeof(D);
751: const auto width_bytes = width * sizeof(D);
752: const auto size = width_bytes * height;
754: PetscFunctionBegin;
755: if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
756: PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size);
757: if (stream) {
758: PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream));
759: } else {
760: PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height));
761: }
762: PetscFunctionReturn(PETSC_SUCCESS);
763: }
765: // these we can transparently wrap, no need to namespace it to Petsc
766: template <typename M>
767: PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
768: {
769: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
770: static_assert(!std::is_const<M>::value, "");
772: if (ptr) {
773: auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
775: ptr = nullptr;
776: if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
777: }
778: return cupmSuccess;
779: }
781: PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }
783: template <typename M>
784: PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
785: {
786: return cupmFreeAsync(ptr);
787: }
789: PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }
791: template <typename M>
792: PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
793: {
794: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
795: const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
796: ptr = nullptr;
797: return cerr;
798: }
800: PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }
802: // specific wrapper for device launch function, as the real function is a C routine and
803: // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
804: // boils down to the fact that ultimately we pass a
805: //
806: // void *args[] = {(void*)&kernel_args...};
807: //
808: // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
809: // happen to the kernel arguments so we must do it ourselves here. This function does this in
810: // 3 stages:
811: // 1. Enumerate the kernel arguments (cupmLaunchKernel)
812: // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
813: // expected by func() using the enumeration above (deduceKernelCall)
814: // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
815: // it. (interface_type::cupmLaunchKernel)
816: template <typename F, typename... Args>
817: PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
818: {
819: return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
820: }
822: template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
823: static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
824: {
825: static_assert(block_size > 0, "");
826: static_assert(warp_size > 0, "");
827: // want block_size to be a multiple of the warp_size
828: static_assert(block_size % warp_size == 0, "");
829: const auto nthread = std::min(n, block_size);
830: const auto nblock = (n + block_size - 1) / block_size;
832: PetscFunctionBegin;
833: // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
834: // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
835: PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
836: PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
837: PetscFunctionReturn(PETSC_SUCCESS);
838: }
840: private:
841: template <typename S, typename D, typename = void>
842: struct is_static_castable : std::false_type { };
844: template <typename S, typename D>
845: struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
847: template <typename D, typename S>
848: static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
849: {
850: return static_cast<D>(std::forward<S>(src));
851: }
853: template <typename D, typename S>
854: static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
855: {
856: return const_cast<D>(std::forward<S>(src));
857: }
859: template <typename F, typename... Args, std::size_t... Idx>
860: PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
861: {
862: // clang-format off
863: return interface_type::template cupmLaunchKernel(
864: std::forward<F>(func),
865: std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
866: // can't static_cast() here since the function argument type may be cv-qualified, in
867: // which case we would need to const_cast(). But you can only const_cast() indirect types
868: // (pointers, references). So we need a SFINAE monster that is a static_cast() if
869: // possible, and a const_cast() if not. We could just use a C-style cast which *would*
870: // work here since it tries the following and uses the first one that succeeds:
871: //
872: // 1. const_cast()
873: // 2. static_cast()
874: // 3. static_cast() then const_cast()
875: // 4. reinterpret_cast()...
876: //
877: // the issue however is the final reinterpret_cast(). We absolutely cannot get there
878: // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
879: // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
880: cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
881: );
882: // clang-format on
883: }
885: static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept
886: {
887: PetscFunctionBegin;
888: // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
889: // (yet) or don't know the direction
890: if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size)));
891: else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size)));
892: else (void)size;
893: PetscFunctionReturn(PETSC_SUCCESS);
894: }
895: };
897: #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \
898: PETSC_CUPM_IMPL_CLASS_HEADER(T); \
899: using cupmReal_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \
900: using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \
901: using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \
902: using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \
903: using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \
904: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \
905: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \
906: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \
907: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \
908: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \
909: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \
910: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \
911: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \
912: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \
913: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \
914: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \
915: using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \
916: using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \
917: using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \
918: using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \
919: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \
920: using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind
922: } // namespace impl
924: } // namespace cupm
926: } // namespace device
928: } // namespace Petsc
930: #endif /* __cplusplus */
932: #endif /* PETSCCUPMINTERFACE_HPP */