Actual source code: cupminterface.hpp

  1: #ifndef PETSCCUPMINTERFACE_HPP
  2: #define PETSCCUPMINTERFACE_HPP

  4: #if defined(__cplusplus)
  5: #include <petsc/private/cpputil.hpp>
  6: #include <petsc/private/petscadvancedmacros.h>
  7: #include <petscdevice_cupm.h>

  9:   #include <array>

 11: namespace Petsc
 12: {

 14: namespace device
 15: {

 17: namespace cupm
 18: {

 20: // enum describing available cupm devices, this is used as the template parameter to any
 21: // class subclassing the Interface or using it as a member variable
 22: enum class DeviceType : int {
 23:   CUDA,
 24:   HIP
 25: };

 27: // clang-format off
 28: static constexpr std::array<const char *const, 5> DeviceTypes = {
 29:   "cuda",
 30:   "hip",
 31:   "Petsc::Device::CUPM::DeviceType",
 32:   "Petsc::Device::CUPM::DeviceType::",
 33:   nullptr
 34: };
 35: // clang-format on

 37: namespace impl
 38: {

 40:   // A backend agnostic PetscCallCUPM() function, this will only work inside the member
 41:   // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
 42:   // functions can also be wrapped inline:
 43:   //
 44:   // PetscCallCUPM(foo<int,char,bool>());
 45:   #define PetscCallCUPM(...) \
 46:     do { \
 47:       const cupmError_t cerr_p_ = __VA_ARGS__; \
 48:       PetscCheck(cerr_p_ == cupmSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
 49:     } while (0)

 51:   #define PetscCallCUPMAbort(comm_, ...) \
 52:     do { \
 53:       const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
 54:       PetscCheckAbort(cerr_abort_p_ == cupmSuccess, comm_, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_abort_p_), cupmGetErrorName(cerr_abort_p_), cupmGetErrorString(cerr_abort_p_)); \
 55:     } while (0)

 57:   // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
 58:   //
 59:   // input params:
 60:   // our_name   - the name of the alias
 61:   // their_name - the name of the function being aliased
 62:   //
 63:   // notes:
 64:   // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
 65:   //
 66:   // example usage:
 67:   // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
 68:   // template <typename... T>
 69:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 70:   // {
 71:   //   return cudaMalloc(std::forward<T>(args)...);
 72:   // }
 73:   //
 74:   // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
 75:   // template <typename... T>
 76:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 77:   // {
 78:   //   return hipMalloc(std::forward<T>(args)...);
 79:   // }
 80:   #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)

 82:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
 83:   // discard the last N arguments
 84:   //
 85:   // input params:
 86:   // our_name   - the name of the alias
 87:   // their_name - the name of the function being aliased
 88:   // N          - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
 89:   //
 90:   // notes:
 91:   // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
 92:   //
 93:   // example use:
 94:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
 95:   // template <typename... T, typename Tend>
 96:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
 97:   // return type deduction*
 98:   // {
 99:   //   (void)argend;
100:   //   return cudaMalloc(std::forward<T>(args)...);
101:   // }
102:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)

104: // Base class that holds functions and variables that don't require CUDA or HIP to be present
105: // on the system
106: template <DeviceType T>
107: struct InterfaceBase {
108:   static const DeviceType type = T;

110:   PETSC_NODISCARD static constexpr const char *cupmName() noexcept
111:   {
112:     static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
113:     static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
114:     return std::get<util::to_underlying(T)>(DeviceTypes);
115:   }

117:   PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; }

119:   PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }

121:   PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
122: };

124: // declare the base class static member variables
125: template <DeviceType T>
126: const DeviceType InterfaceBase<T>::type;

128:   #define PETSC_CUPM_BASE_CLASS_HEADER(T) \
129:     using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \
130:     using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \
131:     using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \
132:     using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \
133:     using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM

135: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
136: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
137: // import them into classes.
138: template <DeviceType>
139: struct InterfaceImpl;

141:   #if PetscDefined(HAVE_CUDA)
142: template <>
143: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
144:   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA);

146:   // typedefs
147:   using cupmError_t             = cudaError_t;
148:   using cupmEvent_t             = cudaEvent_t;
149:   using cupmStream_t            = cudaStream_t;
150:   using cupmDeviceProp_t        = cudaDeviceProp;
151:   using cupmMemcpyKind_t        = cudaMemcpyKind;
152:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
153:   using cupmPointerAttributes_t = cudaPointerAttributes;
154:   using cupmMemoryType_t        = enum cudaMemoryType;
155:   using cupmDim3                = dim3;
156:   using cupmHostFn_t            = cudaHostFn_t;
157:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
158:   using cupmMemPool_t   = cudaMemPool_t;
159:   using cupmMemPoolAttr = cudaMemPoolAttr;
160:     #else
161:   using cupmMemPool_t   = void *;
162:   using cupmMemPoolAttr = unsigned int;
163:     #endif

165:   // values
166:   static const auto cupmSuccess                 = cudaSuccess;
167:   static const auto cupmErrorNotReady           = cudaErrorNotReady;
168:   static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
169:   static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
170:   static const auto cupmErrorStubLibrary =
171:     #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
172:     cudaErrorStubLibrary;
173:     #else
174:     cudaErrorInsufficientDriver;
175:     #endif

177:   static const auto cupmErrorNoDevice          = cudaErrorNoDevice;
178:   static const auto cupmStreamDefault          = cudaStreamDefault;
179:   static const auto cupmStreamNonBlocking      = cudaStreamNonBlocking;
180:   static const auto cupmDeviceMapHost          = cudaDeviceMapHost;
181:   static const auto cupmMemcpyHostToDevice     = cudaMemcpyHostToDevice;
182:   static const auto cupmMemcpyDeviceToHost     = cudaMemcpyDeviceToHost;
183:   static const auto cupmMemcpyDeviceToDevice   = cudaMemcpyDeviceToDevice;
184:   static const auto cupmMemcpyHostToHost       = cudaMemcpyHostToHost;
185:   static const auto cupmMemcpyDefault          = cudaMemcpyDefault;
186:   static const auto cupmMemoryTypeHost         = cudaMemoryTypeHost;
187:   static const auto cupmMemoryTypeDevice       = cudaMemoryTypeDevice;
188:   static const auto cupmMemoryTypeManaged      = cudaMemoryTypeManaged;
189:   static const auto cupmEventDisableTiming     = cudaEventDisableTiming;
190:   static const auto cupmHostAllocDefault       = cudaHostAllocDefault;
191:   static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
192:   static const auto cupmMemPoolAttrReleaseThreshold =
193:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
194:     cudaMemPoolAttrReleaseThreshold;
195:     #else
196:     cupmMemPoolAttr{0};
197:     #endif

199:   // error functions
200:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
201:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
202:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)

204:   // device management
205:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
206:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
207:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
208:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
209:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
210:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
211:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
212:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
213:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
214:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
215:     #else
216:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
217:   {
218:     *pool = nullptr;
219:     return cupmSuccess;
220:   }

222:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
223:     #endif
224:   // CUDA has no cudaInit() to match hipInit()
225:   PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }

227:   // stream management
228:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
229:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
230:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
231:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
232:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
233:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
234:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
235:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
236:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
237:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
238:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
239:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
240:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
241:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
242:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
243:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)

245:   // memory management
246:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
247:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
248:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
249:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
250:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
251:     #else
252:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
253:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
254:     #endif
255:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
256:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
257:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
258:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
259:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
260:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
261:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
262:     #else
263:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
264:     #endif
265:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D)
266:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1)
267:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D)
268:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1)

270:   // launch control
271:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
272:   template <typename FunctionT, typename... KernelArgsT>
273:   PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
274:   {
275:     static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");

277:     void *args[] = {(void *)&kernelArgs...};
278:     return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
279:   }
280: };
281:   #endif // PetscDefined(HAVE_CUDA)

283:   #if PetscDefined(HAVE_HIP)
284: template <>
285: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
286:   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP);

288:   // typedefs
289:   using cupmError_t             = hipError_t;
290:   using cupmEvent_t             = hipEvent_t;
291:   using cupmStream_t            = hipStream_t;
292:   using cupmDeviceProp_t        = hipDeviceProp_t;
293:   using cupmMemcpyKind_t        = hipMemcpyKind;
294:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
295:   using cupmPointerAttributes_t = hipPointerAttribute_t;
296:   using cupmMemoryType_t        = enum hipMemoryType;
297:   using cupmDim3                = dim3;
298:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
299:   using cupmHostFn_t    = hipHostFn_t;
300:   using cupmMemPool_t   = hipMemPool_t;
301:   using cupmMemPoolAttr = hipMemPoolAttr;
302:     #else
303:   using cupmHostFn_t    = void (*)(void *);
304:   using cupmMemPool_t   = void *;
305:   using cupmMemPoolAttr = unsigned int;
306:     #endif

308:   // values
309:   static const auto cupmSuccess       = hipSuccess;
310:   static const auto cupmErrorNotReady = hipErrorNotReady;
311:   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
312:   static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
313:   static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
314:   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
315:   static const auto cupmErrorStubLibrary     = hipErrorInsufficientDriver;
316:   static const auto cupmErrorNoDevice        = hipErrorNoDevice;
317:   static const auto cupmStreamDefault        = hipStreamDefault;
318:   static const auto cupmStreamNonBlocking    = hipStreamNonBlocking;
319:   static const auto cupmDeviceMapHost        = hipDeviceMapHost;
320:   static const auto cupmMemcpyHostToDevice   = hipMemcpyHostToDevice;
321:   static const auto cupmMemcpyDeviceToHost   = hipMemcpyDeviceToHost;
322:   static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
323:   static const auto cupmMemcpyHostToHost     = hipMemcpyHostToHost;
324:   static const auto cupmMemcpyDefault        = hipMemcpyDefault;
325:   static const auto cupmMemoryTypeHost       = hipMemoryTypeHost;
326:   static const auto cupmMemoryTypeDevice     = hipMemoryTypeDevice;
327:   // see
328:   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
329:   static const auto cupmMemoryTypeManaged      = hipMemoryTypeUnified;
330:   static const auto cupmEventDisableTiming     = hipEventDisableTiming;
331:   static const auto cupmHostAllocDefault       = hipHostMallocDefault;
332:   static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
333:   static const auto cupmMemPoolAttrReleaseThreshold =
334:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
335:     hipMemPoolAttrReleaseThreshold;
336:     #else
337:     cupmMemPoolAttr{0};
338:     #endif

340:   // error functions
341:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
342:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
343:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)

345:   // device management
346:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
347:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
348:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
349:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
350:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
351:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
352:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
353:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
354:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
355:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
356:     #else
357:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
358:   {
359:     *pool = nullptr;
360:     return cupmSuccess;
361:   }

363:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
364:     #endif
365:   PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)

367:   // stream management
368:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
369:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
370:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
371:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
372:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
373:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
374:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
375:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
376:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
377:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
378:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
379:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
380:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
381:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
382:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
383:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)

385:   // memory management
386:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
387:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
388:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
389:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
390:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
391:     #else
392:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
393:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
394:     #endif
395:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
396:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
397:   // hipMallocHost is deprecated
398:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
399:   // hipFreeHost is deprecated
400:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
401:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
402:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
403:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D)
404:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1)
405:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D)
406:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1)

408:       // launch control
409:       // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
410:       // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
411:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
412:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
413:     #else
414:   PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
415:   {
416:     // the only correct way to spoof this function is to do it synchronously...
417:     auto herr = hipStreamSynchronize(stream);
418:     if (PetscUnlikely(herr != hipSuccess)) return herr;
419:     fn(ctx);
420:     return herr;
421:   }
422:     #endif

424:   template <typename FunctionT, typename... KernelArgsT>
425:   PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
426:   {
427:     void *args[] = {(void *)&kernelArgs...};
428:     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
429:   }
430: };
431:   #endif // PetscDefined(HAVE_HIP)

433:   // shorthand for bringing all of the typedefs from the base Interface class into your own,
434:   // it's annoying that c++ doesn't have a way to do this automatically
435:   #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \
436:     PETSC_CUPM_BASE_CLASS_HEADER(T); \
437:     /* types */ \
438:     using cupmComplex_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \
439:     using cupmError_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \
440:     using cupmEvent_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \
441:     using cupmStream_t            = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \
442:     using cupmDeviceProp_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \
443:     using cupmMemcpyKind_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \
444:     using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \
445:     using cupmMemoryType_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \
446:     using cupmDim3                = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \
447:     using cupmMemPool_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \
448:     using cupmMemPoolAttr         = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \
449:     /* variables */ \
450:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \
451:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \
452:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \
453:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \
454:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \
455:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \
456:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \
457:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \
458:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \
459:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \
460:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \
461:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \
462:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \
463:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \
464:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \
465:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \
466:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \
467:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \
468:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \
469:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \
470:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \
471:     /* functions */ \
472:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \
473:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \
474:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \
475:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \
476:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \
477:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \
478:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \
479:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \
480:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \
481:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \
482:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \
483:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \
484:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \
485:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \
486:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \
487:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \
488:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \
489:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \
490:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \
491:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \
492:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \
493:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \
494:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \
495:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \
496:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \
497:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \
498:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \
499:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \
500:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \
501:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \
502:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \
503:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \
504:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \
505:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \
506:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \
507:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \
508:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \
509:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \
510:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \
511:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \
512:     using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc

514: // The actual interface class
515: template <DeviceType T>
516: struct Interface : InterfaceImpl<T> {
517: private:
518:   using interface_type = InterfaceImpl<T>;

520: public:
521:   PETSC_CUPM_IMPL_CLASS_HEADER(T);

523:   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
524:   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;

526:   PETSC_NODISCARD static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
527:   {
528:   #if PetscDefined(USE_COMPLEX)
529:     return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
530:   #else
531:     return static_cast<cupmScalar_t>(s);
532:   #endif
533:   }

535:   PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }

537:   PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }

539:   PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }

541:   PETSC_NODISCARD static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }

543:   #if !defined(PETSC_PKG_CUDA_VERSION_GE)
544:     #define PETSC_PKG_CUDA_VERSION_GE(...) 0
545:     #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
546:   #endif
547:   static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
548:   {
549:     cupmPointerAttributes_t attr;
550:     cupmError_t             cerr;

552:     PetscFunctionBegin;
554:     if (registered) {
556:       *registered = PETSC_FALSE;
557:     }
558:     if (managed) {
560:       *managed = PETSC_FALSE;
561:     }
562:     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
563:     // a host pointer returns cudaErrorInvalidValue
564:     cerr = cupmPointerGetAttributes(&attr, data);
565:     cerr = cupmGetLastError();
566:       // HIP seems to always have used memoryType though
567:   #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
568:     const auto mtype = attr.memoryType;
569:     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
570:   #else
571:     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
572:     const auto mtype = attr.type;
573:     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
574:   #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
575:     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
576:     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
577:     PetscFunctionReturn(PETSC_SUCCESS);
578:   }
579:   #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
580:     #undef PETSC_PKG_CUDA_VERSION_GE
581:   #endif

583:   PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
584:   {
585:     switch (mode) {
586:     case PETSC_DEVICE_COPY_HTOH:
587:       return cupmMemcpyHostToHost;
588:     case PETSC_DEVICE_COPY_HTOD:
589:       return cupmMemcpyHostToDevice;
590:     case PETSC_DEVICE_COPY_DTOD:
591:       return cupmMemcpyDeviceToDevice;
592:     case PETSC_DEVICE_COPY_DTOH:
593:       return cupmMemcpyDeviceToHost;
594:     case PETSC_DEVICE_COPY_AUTO:
595:       return cupmMemcpyDefault;
596:     }
597:     PetscUnreachable();
598:     return cupmMemcpyDefault;
599:   }

601:   // these change what the arguments mean, so need to namespace these
602:   template <typename M>
603:   static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
604:   {
605:     static_assert(!std::is_void<M>::value, "");

607:     PetscFunctionBegin;
609:     *ptr = nullptr;
610:     if (n) {
611:       const auto bytes = n * sizeof(M);
612:       // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
613:       //
614:       // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
615:       // underpin its cuda-aware MPI implementation, so we cannot just async allocate
616:       // blindly...
617:       if (stream) {
618:         PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
619:       } else {
620:         PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
621:       }
622:     }
623:     PetscFunctionReturn(PETSC_SUCCESS);
624:   }

626:   template <typename M>
627:   static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
628:   {
629:     PetscFunctionBegin;
630:     PetscCall(PetscCUPMMallocAsync(ptr, n));
631:     PetscFunctionReturn(PETSC_SUCCESS);
632:   }

634:   template <typename M>
635:   static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
636:   {
637:     static_assert(!std::is_void<M>::value, "");

639:     PetscFunctionBegin;
641:     *ptr = nullptr;
642:     if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
643:     PetscFunctionReturn(PETSC_SUCCESS);
644:   }

646:   template <typename D>
647:   static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
648:   {
649:     static_assert(!std::is_void<D>::value, "");
650:     const auto size = n * sizeof(D);

652:     PetscFunctionBegin;
653:     if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
655:     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
656:     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
657:     // do early return after nullptr check since we need to check that they are not both nullptrs
658:     if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
659:     if (kind == cupmMemcpyHostToHost) {
660:       // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
661:       // than it is to just call the vendor functions. This assumes of course that the stream
662:       // accounts for both memory regions being "idle"
663:       if (cupmStreamQuery(stream) == cupmSuccess) {
664:         PetscCall(PetscMemcpy(dest, src, size));
665:         PetscFunctionReturn(PETSC_SUCCESS);
666:       }
667:       // need to clear the potential cupmErrorNotReady generated by query above...
668:       auto cerr = cupmGetLastError();

670:       if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
671:     }
672:     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
673:       PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
674:     } else {
675:       PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
676:     }
677:     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
678:     PetscFunctionReturn(PETSC_SUCCESS);
679:   }

681:   template <typename D>
682:   static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
683:   {
684:     PetscFunctionBegin;
685:     PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
686:     PetscFunctionReturn(PETSC_SUCCESS);
687:   }

689:   template <typename D>
690:   static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr)
691:   {
692:     static_assert(!std::is_void<D>::value, "");
693:     const auto dest_pitch_bytes = dest_pitch * sizeof(D);
694:     const auto src_pitch_bytes  = src_pitch * sizeof(D);
695:     const auto width_bytes      = width * sizeof(D);
696:     const auto size             = height * width_bytes;

698:     PetscFunctionBegin;
699:     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
700:     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
701:     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
702:     if (stream || (kind != cupmMemcpyDeviceToHost)) {
703:       PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream));
704:     } else {
705:       PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind));
706:     }
707:     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
708:     PetscFunctionReturn(PETSC_SUCCESS);
709:   }

711:   template <typename D>
712:   static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind)
713:   {
714:     PetscFunctionBegin;
715:     PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind));
716:     PetscFunctionReturn(PETSC_SUCCESS);
717:   }

719:   template <typename M>
720:   static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
721:   {
722:     static_assert(!std::is_void<M>::value, "");

724:     PetscFunctionBegin;
725:     if (PetscLikely(n)) {
726:       const auto bytes = n * sizeof(M);

728:       PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
729:       if (stream || use_async) {
730:         PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
731:       } else {
732:         PetscCallCUPM(cupmMemset(ptr, value, bytes));
733:       }
734:     }
735:     PetscFunctionReturn(PETSC_SUCCESS);
736:   }

738:   template <typename M>
739:   static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
740:   {
741:     PetscFunctionBegin;
742:     PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
743:     PetscFunctionReturn(PETSC_SUCCESS);
744:   }

746:   template <typename D>
747:   static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr)
748:   {
749:     static_assert(!std::is_void<D>::value, "");
750:     const auto pitch_bytes = pitch * sizeof(D);
751:     const auto width_bytes = width * sizeof(D);
752:     const auto size        = width_bytes * height;

754:     PetscFunctionBegin;
755:     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
756:     PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size);
757:     if (stream) {
758:       PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream));
759:     } else {
760:       PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height));
761:     }
762:     PetscFunctionReturn(PETSC_SUCCESS);
763:   }

765:   // these we can transparently wrap, no need to namespace it to Petsc
766:   template <typename M>
767:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
768:   {
769:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
770:     static_assert(!std::is_const<M>::value, "");

772:     if (ptr) {
773:       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);

775:       ptr = nullptr;
776:       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
777:     }
778:     return cupmSuccess;
779:   }

781:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }

783:   template <typename M>
784:   PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
785:   {
786:     return cupmFreeAsync(ptr);
787:   }

789:   PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }

791:   template <typename M>
792:   PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
793:   {
794:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
795:     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
796:     ptr             = nullptr;
797:     return cerr;
798:   }

800:   PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }

802:   // specific wrapper for device launch function, as the real function is a C routine and
803:   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
804:   // boils down to the fact that ultimately we pass a
805:   //
806:   // void *args[] = {(void*)&kernel_args...};
807:   //
808:   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
809:   // happen to the kernel arguments so we must do it ourselves here. This function does this in
810:   // 3 stages:
811:   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
812:   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
813:   //    expected by func() using the enumeration above (deduceKernelCall)
814:   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
815:   //    it. (interface_type::cupmLaunchKernel)
816:   template <typename F, typename... Args>
817:   PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
818:   {
819:     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
820:   }

822:   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
823:   static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
824:   {
825:     static_assert(block_size > 0, "");
826:     static_assert(warp_size > 0, "");
827:     // want block_size to be a multiple of the warp_size
828:     static_assert(block_size % warp_size == 0, "");
829:     const auto nthread = std::min(n, block_size);
830:     const auto nblock  = (n + block_size - 1) / block_size;

832:     PetscFunctionBegin;
833:     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
834:     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
835:     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
836:     PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
837:     PetscFunctionReturn(PETSC_SUCCESS);
838:   }

840: private:
841:   template <typename S, typename D, typename = void>
842:   struct is_static_castable : std::false_type { };

844:   template <typename S, typename D>
845:   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };

847:   template <typename D, typename S>
848:   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
849:   {
850:     return static_cast<D>(std::forward<S>(src));
851:   }

853:   template <typename D, typename S>
854:   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
855:   {
856:     return const_cast<D>(std::forward<S>(src));
857:   }

859:   template <typename F, typename... Args, std::size_t... Idx>
860:   PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
861:   {
862:     // clang-format off
863:     return interface_type::template cupmLaunchKernel(
864:       std::forward<F>(func),
865:       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
866:       // can't static_cast() here since the function argument type may be cv-qualified, in
867:       // which case we would need to const_cast(). But you can only const_cast() indirect types
868:       // (pointers, references). So we need a SFINAE monster that is a static_cast() if
869:       // possible, and a const_cast() if not. We could just use a C-style cast which *would*
870:       // work here since it tries the following and uses the first one that succeeds:
871:       //
872:       // 1. const_cast()
873:       // 2. static_cast()
874:       // 3. static_cast() then const_cast()
875:       // 4. reinterpret_cast()...
876:       //
877:       // the issue however is the final reinterpret_cast(). We absolutely cannot get there
878:       // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
879:       // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
880:       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
881:     );
882:     // clang-format on
883:   }

885:   static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept
886:   {
887:     PetscFunctionBegin;
888:     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
889:     // (yet) or don't know the direction
890:     if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size)));
891:     else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size)));
892:     else (void)size;
893:     PetscFunctionReturn(PETSC_SUCCESS);
894:   }
895: };

897:   #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \
898:     PETSC_CUPM_IMPL_CLASS_HEADER(T); \
899:     using cupmReal_t   = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \
900:     using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \
901:     using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \
902:     using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \
903:     using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \
904:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \
905:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \
906:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \
907:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \
908:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \
909:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \
910:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \
911:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \
912:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \
913:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \
914:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \
915:     using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \
916:     using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \
917:     using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \
918:     using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \
919:     using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \
920:     using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind

922: } // namespace impl

924: } // namespace cupm

926: } // namespace device

928: } // namespace Petsc

930: #endif /* __cplusplus */

932: #endif /* PETSCCUPMINTERFACE_HPP */