Actual source code: veccupmimpl.h

  1: #ifndef PETSCVECCUPMIMPL_H
  2: #define PETSCVECCUPMIMPL_H

  4: #include <petsc/private/vecimpl.h>
  5: #include <../src/vec/vec/impls/dvecimpl.h>

  7: #if PetscDefined(HAVE_NVSHMEM)
  8: PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
  9: PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
 10: PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
 11: PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
 12:   #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
 13: PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
 14: PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
 15: PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *);
 16: PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
 17: #else
 18:   #define PetscNvshmemFree(ptr) PETSC_SUCCESS
 19: #endif
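     // Note: the NVSHMEM variant of PetscNvshmemFree() frees a non-null ptr and, on success,
     // resets it to PETSC_NULLPTR; either variant evaluates to a PetscErrorCode, so callers can
     // write e.g. PetscCall(PetscNvshmemFree(device_array)) as done in ResetAllocatedDevicePtr_()
     // below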

 21: #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
 22:   #include <petsc/private/deviceimpl.h>
 23:   #include <petsc/private/cupmobject.hpp>
 24:   #include <petsc/private/cupmblasinterface.hpp>

 26:   #include <petsc/private/cpp/functional.hpp>

 28:   #include <limits> // std::numeric_limits

 30: namespace Petsc
 31: {

 33: namespace vec
 34: {

 36: namespace cupm
 37: {

 39: namespace impl
 40: {

 42: namespace
 43: {

 45: struct no_op {
 46:   template <typename... T>
 47:   constexpr PetscErrorCode operator()(T &&...) const noexcept
 48:   {
 49:     return PETSC_SUCCESS;
 50:   }
 51: };

 53: template <typename T>
 54: struct CooPair {
 55:   using value_type = T;
 56:   using size_type  = PetscCount;

 58:   value_type *&device;
 59:   value_type *&host;
 60:   size_type    size;
 61: };

 63: template <typename U>
 64: static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
 65: {
 66:   return {device, host, size};
 67: }
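     // make_coo_pair() lets the element type be deduced at the call site; a typical call
     // (mirroring SetPreallocationCOO_CUPMBase() below) looks like
     //
     //   make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1)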

 69: } // anonymous namespace

 71: // forward declarations
 72: template <device::cupm::DeviceType>
 73: class VecSeq_CUPM;
 74: template <device::cupm::DeviceType>
 75: class VecMPI_CUPM;

 77: // ==========================================================================================
 78: // Vec_CUPMBase
 79: //
 80: // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
 81: // template parameter it also uses CRTP to be able to use values/calls specific to either
 82: // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
 83: // ==========================================================================================
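     // A rough sketch of the arrangement (illustrative only; see VecSeq_CUPM and VecMPI_CUPM,
     // forward-declared above, for the real derived classes):
     //
     //   template <device::cupm::DeviceType T>
     //   class VecSeq_CUPM : public Vec_CUPMBase<T, VecSeq_CUPM<T>> { /* ... */ };
     //
     // i.e. the derived class passes itself as Derived, which lets this base call static members
     // of the derived class (such as VecIMPLCast_()) without any virtual dispatch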
 84: template <device::cupm::DeviceType T, typename Derived>
 85: class Vec_CUPMBase : protected device::cupm::impl::CUPMObject<T> {
 86: public:
 87:   PETSC_CUPMOBJECT_HEADER(T);

 89:   // ==========================================================================================
 90:   // Vec_CUPMBase::VectorArray
 91:   //
 92:   // RAII versions of the get/restore array routines. Determines constness of the pointer type,
 93:   // holds the pointer itself and provides the implicit conversion operator
 94:   // ==========================================================================================
 95:   template <PetscMemType, PetscMemoryAccessMode>
 96:   class VectorArray;

 98: protected:
 99:   static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
100:   {
101:     const auto   pobj  = PetscObjectCast(v);
102:     const auto   vimpl = VecIMPLCast(v);
103:     const auto   vcu   = VecCUPMCast(v);
104:     PetscMemType mtype;
105:     MPI_Comm     comm;

107:     PetscFunctionBegin;
110:     PetscCall(PetscObjectGetComm(pobj, &comm));
111:     PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
112:     PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
113:     PetscCall(PetscPrintf(comm, "Address:             %p\n", v));
114:     PetscCall(PetscPrintf(comm, "Size:                %" PetscInt_FMT "\n", v->map->n));
115:     PetscCall(PetscPrintf(comm, "Offload mask:        %s\n", PetscOffloadMaskToString(v->offloadmask)));
116:     PetscCall(PetscPrintf(comm, "Host ptr:            %p\n", vimpl->array));
117:     PetscCall(PetscPrintf(comm, "Device ptr:          %p\n", vcu->array_d));
118:     PetscCall(PetscPrintf(comm, "Device alloced ptr:  %p\n", vcu->array_allocated_d));
119:     PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
120:     PetscCall(PetscPrintf(comm, "dptr is device mem?  %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
121:     PetscFunctionReturn(PETSC_SUCCESS);
122:   }

124:   // Delete the allocated device array if required and replace it with the given array
125:   static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
126:   // Check that either the host or device impl pointer is allocated, and allocate it if it
127:   // isn't. CastFunctionType casts the Vec to the required type and returns the pointer
128:   template <typename CastFunctionType>
129:   static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
130:   // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
131:   static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
132:   // Check the Host part (v->data) is allocated, otherwise allocate it
133:   static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
134:   // Check the Host array is allocated, otherwise allocate it
135:   static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
136:   // Check the CUPM array is allocated, otherwise allocate it
137:   static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
138:   // Copy HTOD, allocating device if necessary
139:   static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
140:   // Copy DTOH, allocating host if necessary
141:   static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;

143: public:
144:   struct Vec_CUPM {
145:     PetscScalar *array_d;           // gpu data
146:     PetscScalar *array_allocated_d; // gpu data allocated by PETSc (non-null means PETSc owns, and will free, this ptr)
147:     PetscBool    nvshmem;           // is array allocated in nvshmem? It is used to allocate
148:                                     // Mvctx->lvec in nvshmem

150:     // COO stuff
151:     PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
152:                          // in COO arrays
153:     PetscCount *perm1_d; // [tot1]: permutation array for local entries
154:     PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
155:                          // the vector
156:     PetscCount *jmap2_d; // [nnz2+1]
157:     PetscCount *perm2_d; // [recvlen]
158:     PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
159:                          // communication

161:     // Buffers for remote values in VecSetValuesCOO()
162:     PetscScalar *sendbuf_d;
163:     PetscScalar *recvbuf_d;
164:   };

166:   // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
167:   PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
168:   // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
169:   template <typename U = Derived>
170:   PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
171:   // Get the PetscLogEvents for HTOD and DTOH
172:   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
173:   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
174:   // Get the VecTypes
175:   PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
176:   PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
177:   PETSC_NODISCARD static constexpr VecType VECCUPM() noexcept;

179:   // Get the VecType of the calling vector
180:   template <typename U = Derived>
181:   PETSC_NODISCARD static constexpr VecType VECIMPLCUPM() noexcept;

183:   // Call the host destroy function, i.e. VecDestroy_Seq()
184:   static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
185:   // Call the host reset function, i.e. VecResetArray_Seq()
186:   static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
187:   // ... you get the idea
188:   static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
189:   // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
190:   // along with it if needed
191:   static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;

193:   // Shorthand for creating VectorArrays. Need functions to create them, otherwise using them
194:   // as an unnamed temporary leads to the most vexing parse
195:   PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
196:   PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
197:   PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
198:   PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
199:   PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
200:   PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
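       // Example use (illustrative sketch; assumes a PetscDeviceContext dctx and a CUPM Vec v are
       // in scope):
       //
       //   const auto varr = DeviceArrayRead(dctx, v); // GetArray<DEVICE, READ>() on construction
       //   // ... varr converts implicitly to the (const-qualified) PetscScalar pointer ...
       //   // RestoreArray<DEVICE, READ>() runs automatically when varr goes out of scope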

202:   // ops-table functions
203:   static PetscErrorCode Create(Vec) noexcept;
204:   static PetscErrorCode Destroy(Vec) noexcept;
205:   template <PetscMemType, PetscMemoryAccessMode, bool = false>
206:   static PetscErrorCode GetArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
207:   template <PetscMemType, PetscMemoryAccessMode, bool = false>
208:   static PetscErrorCode GetArray(Vec, PetscScalar **) noexcept;
209:   template <PetscMemType, PetscMemoryAccessMode>
210:   static PetscErrorCode RestoreArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
211:   template <PetscMemType, PetscMemoryAccessMode>
212:   static PetscErrorCode RestoreArray(Vec, PetscScalar **) noexcept;
213:   template <PetscMemoryAccessMode>
214:   static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
215:   template <PetscMemoryAccessMode>
216:   static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
217:   template <PetscMemoryAccessMode>
218:   static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
219:   template <PetscMemoryAccessMode>
220:   static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **) noexcept;
221:   template <PetscMemType>
222:   static PetscErrorCode ReplaceArray(Vec, const PetscScalar *) noexcept;
223:   template <PetscMemType>
224:   static PetscErrorCode ResetArray(Vec) noexcept;
225:   template <PetscMemType>
226:   static PetscErrorCode PlaceArray(Vec, const PetscScalar *) noexcept;

228:   // common ops shared between Seq and MPI
229:   static PetscErrorCode Create_CUPM(Vec) noexcept;
230:   static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
231:   static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
232:   template <typename SetupFunctionT = no_op>
233:   static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
234:   static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
235:   static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
236:   static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
237:   template <std::size_t NCount = 0, std::size_t NScal = 0>
238:   static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
239: };

241: // ==========================================================================================
242: // Vec_CUPMBase::VectorArray
243: //
244: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
245: // holds the pointer itself and provides the implicit conversion operator.
246: //
247: // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
248: // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
249: // for you
250: // ==========================================================================================
251: template <device::cupm::DeviceType T, typename D>
252: template <PetscMemType MT, PetscMemoryAccessMode MA>
253: class Vec_CUPMBase<T, D>::VectorArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
254:   using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;

256: public:
257:   VectorArray(PetscDeviceContext, Vec) noexcept;
258:   ~VectorArray() noexcept;

260: private:
261:   Vec v_ = nullptr;
262: };

264: // ==========================================================================================
265: // Vec_CUPMBase::VectorArray - Public API
266: // ==========================================================================================

268: template <device::cupm::DeviceType T, typename D>
269: template <PetscMemType MT, PetscMemoryAccessMode MA>
270: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::VectorArray(PetscDeviceContext dctx, Vec v) noexcept : base_type{dctx}, v_{v}
271: {
272:   PetscFunctionBegin;
273:   PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template GetArray<MT, MA, true>(v, &this->ptr_, dctx));
274:   PetscFunctionReturnVoid();
275: }

277: template <device::cupm::DeviceType T, typename D>
278: template <PetscMemType MT, PetscMemoryAccessMode MA>
279: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::~VectorArray() noexcept
280: {
281:   PetscFunctionBegin;
282:   PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template RestoreArray<MT, MA>(v_, &this->ptr_, this->dctx_));
283:   PetscFunctionReturnVoid();
284: }

286: // ==========================================================================================
287: // Vec_CUPMBase - Protected API
288: // ==========================================================================================

290: template <device::cupm::DeviceType T, typename D>
291: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
292: {
293:   auto &device_array = VecCUPMCast(v)->array_allocated_d;

295:   PetscFunctionBegin;
296:   if (device_array) {
297:     if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
298:       PetscCall(PetscNvshmemFree(device_array));
299:     } else {
300:       cupmStream_t stream;

302:       PetscCall(GetHandlesFrom_(dctx, &stream));
303:       PetscCallCUPM(cupmFreeAsync(device_array, stream));
304:     }
305:   }
306:   device_array = new_value;
307:   PetscFunctionReturn(PETSC_SUCCESS);
308: }

310: namespace
311: {

313: inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
314: {
315:   auto      mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
316:   PetscBool flg;

318:   PetscFunctionBegin;
319:   PetscObjectOptionsBegin(PetscObjectCast(v));
320:   PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
321:   if (flg) v->minimum_bytes_pinned_memory = mem;
322:   PetscOptionsEnd();
323:   PetscFunctionReturn(PETSC_SUCCESS);
324: }

326: } // anonymous namespace

328: template <device::cupm::DeviceType T, typename D>
329: template <typename CastFunctionType>
330: inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
331: {
332:   PetscFunctionBegin;
333:   if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
334:   // do the check here so we don't have to do it in every function
335:   PetscCall(checkCupmBlasIntCast(v->map->n));
336:   {
337:     auto impl = cast(v);

339:     PetscCall(PetscNew(&impl));
340:     dest = impl;
341:   }
342:   PetscFunctionReturn(PETSC_SUCCESS);
343: }

345: template <device::cupm::DeviceType T, typename D>
346: inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
347: {
348:   PetscFunctionBegin;
349:   PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
350:   PetscFunctionReturn(PETSC_SUCCESS);
351: }

353: // allocate the Vec_CUPM struct. this is normally done through DeviceAllocateCheck_(), but in
354: // certain circumstances (such as when the user places the device array) we do not want to do
355: // the full DeviceAllocateCheck_() as it also allocates the array
356: template <device::cupm::DeviceType T, typename D>
357: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
358: {
359:   PetscFunctionBegin;
360:   PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
361:   PetscFunctionReturn(PETSC_SUCCESS);
362: }

364: template <device::cupm::DeviceType T, typename D>
365: inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
366: {
367:   PetscFunctionBegin;
368:   PetscCall(VecIMPLAllocateCheck_(v));
369:   if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
370:   else {
371:     PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
372:     {
373:       const auto n     = v->map->n;
374:       const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);

376:       v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
377:       PetscCall(PetscMalloc1(n, &alloc));
378:     }
379:     if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
380:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
381:   }
382:   PetscFunctionReturn(PETSC_SUCCESS);
383: }

385: template <device::cupm::DeviceType T, typename D>
386: inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
387: {
388:   PetscFunctionBegin;
389:   PetscCall(VecCUPMAllocateCheck_(v));
390:   if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
391:   else {
392:     const auto   n                 = v->map->n;
393:     auto        &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
394:     cupmStream_t stream;

396:     PetscCall(GetHandlesFrom_(dctx, &stream));
397:     PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
398:     alloc = array_allocated_d;
399:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
400:       const auto vimp = VecIMPLCast(v);
401:       v->offloadmask  = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
402:     }
403:   }
404:   PetscFunctionReturn(PETSC_SUCCESS);
405: }

407: template <device::cupm::DeviceType T, typename D>
408: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
409: {
410:   PetscFunctionBegin;
411:   PetscCall(DeviceAllocateCheck_(dctx, v));
412:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
413:     cupmStream_t stream;

415:     v->offloadmask = PETSC_OFFLOAD_BOTH;
416:     PetscCall(GetHandlesFrom_(dctx, &stream));
417:     PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
418:     PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
419:     PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
420:   }
421:   PetscFunctionReturn(PETSC_SUCCESS);
422: }

424: template <device::cupm::DeviceType T, typename D>
425: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
426: {
427:   PetscFunctionBegin;
428:   PetscCall(HostAllocateCheck_(dctx, v));
429:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
430:     cupmStream_t stream;

432:     v->offloadmask = PETSC_OFFLOAD_BOTH;
433:     PetscCall(GetHandlesFrom_(dctx, &stream));
434:     PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
435:     PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
436:     PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
437:   }
438:   PetscFunctionReturn(PETSC_SUCCESS);
439: }

441: // ==========================================================================================
442: // Vec_CUPMBase - Public API
443: // ==========================================================================================

445: template <device::cupm::DeviceType T, typename D>
446: inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
447: {
448:   return static_cast<Vec_CUPM *>(v->spptr);
449: }

451: // This is a trick to get around the fact that in CRTP the derived class is not yet fully
452: // defined because Base<Derived> must necessarily be instantiated before Derived is
453: // complete. By using a dummy template parameter we make the type "dependent", so it will
454: // only be determined when the derived class is instantiated (and therefore fully defined)
455: template <device::cupm::DeviceType T, typename D>
456: template <typename U>
457: inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
458: {
459:   return U::VecIMPLCast_(v);
460: }

462: template <device::cupm::DeviceType T, typename D>
463: inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
464: {
465:   return D::VecDestroy_IMPL_(v);
466: }

468: template <device::cupm::DeviceType T, typename D>
469: inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
470: {
471:   return D::VecResetArray_IMPL_(v);
472: }

474: template <device::cupm::DeviceType T, typename D>
475: inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
476: {
477:   return D::VecPlaceArray_IMPL_(v, a);
478: }

480: template <device::cupm::DeviceType T, typename D>
481: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
482: {
483:   return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
484: }

486: template <device::cupm::DeviceType T, typename D>
487: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
488: {
489:   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
490: }

492: template <device::cupm::DeviceType T, typename D>
493: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
494: {
495:   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
496: }

498: template <device::cupm::DeviceType T, typename D>
499: inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
500: {
501:   return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
502: }

504: template <device::cupm::DeviceType T, typename D>
505: inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
506: {
507:   return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
508: }

510: template <device::cupm::DeviceType T, typename D>
511: inline constexpr VecType Vec_CUPMBase<T, D>::VECCUPM() noexcept
512: {
513:   return T == device::cupm::DeviceType::CUDA ? VECCUDA : VECHIP;
514: }

516: template <device::cupm::DeviceType T, typename D>
517: template <typename U>
518: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
519: {
520:   return U::VECIMPLCUPM_();
521: }

523: // private version that takes a PetscDeviceContext, called by the public variant
524: template <device::cupm::DeviceType T, typename D>
525: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
526: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
527: {
528:   constexpr auto hostmem     = PetscMemTypeHost(mtype);
529:   const auto     oldmask     = v->offloadmask;
530:   auto          &mask        = v->offloadmask;
531:   auto           should_sync = false;

533:   PetscFunctionBegin;
534:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
535:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
536:   if (PetscMemoryAccessRead(access)) {
537:     // READ or READ_WRITE
538:     if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
539:       // if we move the data we should set the flag to synchronize later on
540:       should_sync = true;
541:     }
542:     PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
543:   } else {
544:     // WRITE only
545:     PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
546:   }
547:   *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
548:   // if unallocated previously we should zero things out if we intend to read
549:   if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
550:     const auto n = v->map->n;

552:     if (hostmem) {
553:       PetscCall(PetscArrayzero(*a, n));
554:     } else {
555:       cupmStream_t stream;

557:       PetscCall(GetHandlesFrom_(dctx, &stream));
558:       PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
559:       should_sync = true;
560:     }
561:   }
562:   // update the offloadmask if we intend to write, since we assume it is immediately modified
563:   if (PetscMemoryAccessWrite(access)) {
564:     PetscCall(VecSetErrorIfLocked(v, 1));
565:     // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
566:     // is immediately modified
567:     mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
568:   }
569:   // if we are on a globally blocking stream and we have MOVED data then we should synchronize,
570:   // since even doing async calls on the NULL stream is not synchronous
571:   if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
572:   PetscFunctionReturn(PETSC_SUCCESS);
573: }

575: // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
576: template <device::cupm::DeviceType T, typename D>
577: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
578: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a) noexcept
579: {
580:   PetscDeviceContext dctx;

582:   PetscFunctionBegin;
583:   PetscCall(GetHandles_(&dctx));
584:   PetscCall(D::template GetArray<mtype, access, force>(v, a, dctx));
585:   PetscFunctionReturn(PETSC_SUCCESS);
586: }

588: // private version that takes a PetscDeviceContext, called by the public variant
589: template <device::cupm::DeviceType T, typename D>
590: template <PetscMemType mtype, PetscMemoryAccessMode access>
591: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
592: {
593:   PetscFunctionBegin;
594:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
595:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
596:   if (PetscMemoryAccessWrite(access)) {
597:     // WRITE or READ_WRITE
598:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
599:     v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
600:   }
601:   if (a) {
602:     PetscCall(CheckPointerMatchesMemType_(*a, mtype));
603:     *a = nullptr;
604:   }
605:   PetscFunctionReturn(PETSC_SUCCESS);
606: }

608: // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
609: template <device::cupm::DeviceType T, typename D>
610: template <PetscMemType mtype, PetscMemoryAccessMode access>
611: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a) noexcept
612: {
613:   PetscDeviceContext dctx;

615:   PetscFunctionBegin;
616:   PetscCall(GetHandles_(&dctx));
617:   PetscCall(D::template RestoreArray<mtype, access>(v, a, dctx));
618:   PetscFunctionReturn(PETSC_SUCCESS);
619: }

621: template <device::cupm::DeviceType T, typename D>
622: template <PetscMemoryAccessMode access>
623: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
624: {
625:   PetscFunctionBegin;
626:   PetscCall(D::template GetArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
627:   if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
628:   PetscFunctionReturn(PETSC_SUCCESS);
629: }

631: // v->ops->getarrayandmemtype
632: template <device::cupm::DeviceType T, typename D>
633: template <PetscMemoryAccessMode access>
634: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
635: {
636:   PetscDeviceContext dctx;

638:   PetscFunctionBegin;
639:   PetscCall(GetHandles_(&dctx));
640:   PetscCall(D::template GetArrayAndMemtype<access>(v, a, mtype, dctx));
641:   PetscFunctionReturn(PETSC_SUCCESS);
642: }

644: template <device::cupm::DeviceType T, typename D>
645: template <PetscMemoryAccessMode access>
646: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
647: {
648:   PetscFunctionBegin;
649:   PetscCall(D::template RestoreArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
650:   PetscFunctionReturn(PETSC_SUCCESS);
651: }

653: // v->ops->restorearrayandmemtype
654: template <device::cupm::DeviceType T, typename D>
655: template <PetscMemoryAccessMode access>
656: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a) noexcept
657: {
658:   PetscDeviceContext dctx;

660:   PetscFunctionBegin;
661:   PetscCall(GetHandles_(&dctx));
662:   PetscCall(D::template RestoreArrayAndMemtype<access>(v, a, dctx));
663:   PetscFunctionReturn(PETSC_SUCCESS);
664: }

666: // v->ops->placearray or VecCUPMPlaceArray()
667: template <device::cupm::DeviceType T, typename D>
668: template <PetscMemType mtype>
669: inline PetscErrorCode Vec_CUPMBase<T, D>::PlaceArray(Vec v, const PetscScalar *a) noexcept
670: {
671:   PetscDeviceContext dctx;

673:   PetscFunctionBegin;
674:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
675:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
676:   PetscCall(CheckPointerMatchesMemType_(a, mtype));
677:   PetscCall(GetHandles_(&dctx));
678:   if (PetscMemTypeHost(mtype)) {
679:     PetscCall(CopyToHost_(dctx, v));
680:     PetscCall(VecPlaceArray_IMPL(v, a));
681:     v->offloadmask = PETSC_OFFLOAD_CPU;
682:   } else {
683:     PetscCall(VecIMPLAllocateCheck_(v));
684:     {
685:       auto &backup_array = VecIMPLCast(v)->unplacedarray;

687:       PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
688:       PetscCall(CopyToDevice_(dctx, v));
689:       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
690:       backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
691:       // only update the offload mask if we actually assign a pointer
692:       if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
693:     }
694:   }
695:   PetscFunctionReturn(PETSC_SUCCESS);
696: }

698: // v->ops->replacearray or VecCUPMReplaceArray()
699: template <device::cupm::DeviceType T, typename D>
700: template <PetscMemType mtype>
701: inline PetscErrorCode Vec_CUPMBase<T, D>::ReplaceArray(Vec v, const PetscScalar *a) noexcept
702: {
703:   const auto         aptr = const_cast<PetscScalar *>(a);
704:   PetscDeviceContext dctx;

706:   PetscFunctionBegin;
707:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
708:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
709:   PetscCall(CheckPointerMatchesMemType_(a, mtype));
710:   PetscCall(GetHandles_(&dctx));
711:   if (PetscMemTypeHost(mtype)) {
712:     PetscCall(VecIMPLAllocateCheck_(v));
713:     {
714:       const auto vimpl      = VecIMPLCast(v);
715:       auto      &host_array = vimpl->array_allocated;

717:       // make sure the user's array has the latest values.
718:       // REVIEW ME: why? we're about to free it
719:       if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
720:       if (host_array) {
721:         const auto useit = UseCUPMHostAlloc(v->pinned_memory);

723:         PetscCall(PetscFree(host_array));
724:       }
725:       host_array       = aptr;
726:       vimpl->array     = host_array;
727:       v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
728:       v->offloadmask   = PETSC_OFFLOAD_CPU;
729:     }
730:   } else {
731:     PetscCall(VecCUPMAllocateCheck_(v));
732:     {
733:       const auto vcu = VecCUPMCast(v);

735:       PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
736:       // don't update the offloadmask if the replacement pointer is NULL
737:       vcu->array_d = vcu->array_allocated_d /* = aptr */;
738:       if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
739:     }
740:   }
741:   PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
742:   PetscFunctionReturn(PETSC_SUCCESS);
743: }

745: // v->ops->resetarray or VecCUPMResetArray()
746: template <device::cupm::DeviceType T, typename D>
747: template <PetscMemType mtype>
748: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetArray(Vec v) noexcept
749: {
750:   PetscDeviceContext dctx;

752:   PetscFunctionBegin;
753:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
754:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
755:   PetscCall(GetHandles_(&dctx));
756:   // REVIEW ME:
757:   // this is wildly inefficient but must be done if we assume that the placed array must have
758:   // correct values
759:   if (PetscMemTypeHost(mtype)) {
760:     PetscCall(CopyToHost_(dctx, v));
761:     PetscCall(VecResetArray_IMPL(v));
762:     v->offloadmask = PETSC_OFFLOAD_CPU;
763:   } else {
764:     PetscCall(VecIMPLAllocateCheck_(v));
765:     PetscCall(VecCUPMAllocateCheck_(v));
766:     {
767:       const auto vcu        = VecCUPMCast(v);
768:       const auto vimpl      = VecIMPLCast(v);
769:       auto      &host_array = vimpl->unplacedarray;

771:       PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
772:       PetscCall(CopyToDevice_(dctx, v));
773:       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
774:       // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
775:       // otherwise check if the host has a valid pointer. If neither, then we are not
776:       // allocated.
777:       vcu->array_d = host_array;
778:       if (host_array) {
779:         host_array     = nullptr;
780:         v->offloadmask = PETSC_OFFLOAD_GPU;
781:       } else if (vimpl->array) {
782:         v->offloadmask = PETSC_OFFLOAD_CPU;
783:       } else {
784:         v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
785:       }
786:     }
787:   }
788:   PetscFunctionReturn(PETSC_SUCCESS);
789: }

791: // v->ops->create
792: template <device::cupm::DeviceType T, typename D>
793: inline PetscErrorCode Vec_CUPMBase<T, D>::Create(Vec v) noexcept
794: {
795:   PetscBool          alloc_missing;
796:   PetscDeviceContext dctx;

798:   PetscFunctionBegin;
799:   PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
800:   PetscCall(GetHandles_(&dctx));
801:   PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
802:   PetscFunctionReturn(PETSC_SUCCESS);
803: }

805: // v->ops->destroy
806: template <device::cupm::DeviceType T, typename D>
807: inline PetscErrorCode Vec_CUPMBase<T, D>::Destroy(Vec v) noexcept
808: {
809:   PetscFunctionBegin;
810:   if (const auto vcu = VecCUPMCast(v)) {
811:     PetscDeviceContext dctx;

813:     PetscCall(GetHandles_(&dctx));
814:     PetscCall(ResetAllocatedDevicePtr_(dctx, v));
815:     PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
816:     PetscCall(PetscFree(v->spptr));
817:   }
818:   PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
819:   if (const auto vimpl = VecIMPLCast(v)) {
820:     if (auto &array_allocated = vimpl->array_allocated) {
821:       const auto useit = UseCUPMHostAlloc(v->pinned_memory);

823:       // do this ourselves since we may want to use the cupm functions
824:       PetscCall(PetscFree(array_allocated));
825:     }
826:   }
827:   v->pinned_memory = PETSC_FALSE;
828:   PetscCall(VecDestroy_IMPL(v));
829:   PetscFunctionReturn(PETSC_SUCCESS);
830: }

832: // ================================================================================== //
833: //                      Common core between Seq and MPI                               //

835: // VecCreate_CUPM()
836: template <device::cupm::DeviceType T, typename D>
837: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
838: {
839:   PetscMPIInt size;

841:   PetscFunctionBegin;
842:   PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
843:   PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
844:   PetscFunctionReturn(PETSC_SUCCESS);
845: }

847: // VecCreateCUPM()
848: template <device::cupm::DeviceType T, typename D>
849: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
850: {
851:   PetscFunctionBegin;
852:   PetscCall(VecCreate(comm, v));
853:   if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
854:   PetscCall(VecSetSizes(*v, n, N));
855:   if (bs) PetscCall(VecSetBlockSize(*v, bs));
856:   if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
857:   PetscFunctionReturn(PETSC_SUCCESS);
858: }

860: // VecCreateIMPL_CUPM(), called through v->ops->create
861: template <device::cupm::DeviceType T, typename D>
862: inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
863: {
864:   PetscFunctionBegin;
865:   // REVIEW ME: perhaps not needed
866:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
867:   PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
868:   PetscCall(D::BindToCPU(v, PETSC_FALSE));
869:   if (device_array) {
870:     PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
871:     PetscCall(VecCUPMAllocateCheck_(v));
872:     VecCUPMCast(v)->array_d = device_array;
873:   }
874:   if (host_array) {
875:     PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
876:     VecIMPLCast(v)->array = host_array;
877:   }
878:   if (allocate_missing) {
879:     PetscCall(DeviceAllocateCheck_(dctx, v));
880:     PetscCall(HostAllocateCheck_(dctx, v));
881:     // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
882:     // set() for reference
883:     // calls device-version
884:     PetscCall(VecSet(v, 0));
885:     // zero the host array while the device VecSet() is underway
886:     PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
887:     v->offloadmask = PETSC_OFFLOAD_BOTH;
888:   } else {
889:     if (host_array) {
890:       v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
891:     } else {
892:       v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
893:     }
894:   }
895:   PetscFunctionReturn(PETSC_SUCCESS);
896: }

898: // v->ops->duplicate
899: template <device::cupm::DeviceType T, typename D>
900: template <typename SetupFunctionT>
901: inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
902: {
903:   // if the derived setup is the default no_op then we should call VecSetType()
904:   constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
905:   const auto     vobj          = PetscObjectCast(v);
906:   const auto     map           = v->map;
907:   PetscInt       bs;

909:   PetscFunctionBegin;
910:   PetscCall(VecGetBlockSize(v, &bs));
911:   PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
912:   // Derived class can set up the remainder of the data structures here
913:   PetscCall(DerivedCreateIMPLCUPM_Async(*y));
914:   // If the other vector is bound to CPU then the memcpy of the ops struct will give the
915:   // duplicated vector the host "getarray" function which does not lazily allocate the array
916:   // (as it is assumed to always exist). So we force allocation here, before we overwrite the
917:   // ops
918:   if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
919:   // in case the user has done some VecSetOps() tomfoolery
920:   PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
921:   {
922:     const auto yobj = PetscObjectCast(*y);

924:     PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
925:     PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
926:   }
927:   (*y)->stash.donotstash   = v->stash.donotstash;
928:   (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
929:   (*y)->map->bs            = std::abs(v->map->bs);
930:   (*y)->bstash.bs          = v->bstash.bs;
931:   PetscFunctionReturn(PETSC_SUCCESS);
932: }

934:   #define VecSetOp_CUPM(op_name, op_host, ...) \
935:     do { \
936:       if (usehost) { \
937:         v->ops->op_name = op_host; \
938:       } else { \
939:         v->ops->op_name = __VA_ARGS__; \
940:       } \
941:     } while (0)
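     // e.g. VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::Scale) installs either the host or the
     // device implementation of v->ops->scale depending on usehost (see BindToCPU_CUPMBase()
     // below)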

943: // v->ops->bindtocpu
944: template <device::cupm::DeviceType T, typename D>
945: inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
946: {
947:   PetscFunctionBegin;
948:   v->boundtocpu = usehost;
949:   if (usehost) PetscCall(CopyToHost_(dctx, v));
950:   PetscCall(PetscStrFreeAllocpy(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));

952:   // set the base functions that are guaranteed to be the same for both
953:   v->ops->duplicate = D::Duplicate;
954:   v->ops->create    = D::Create;
955:   v->ops->destroy   = D::Destroy;
956:   v->ops->bindtocpu = D::BindToCPU;
957:   // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
958:   // why, and I don't know how, but it is IMPERATIVE these are set as such!
959:   v->ops->replacearray = D::template ReplaceArray<PETSC_MEMTYPE_HOST>;
960:   v->ops->restorearray = D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;

962:   // set device-only common functions
963:   VecSetOp_CUPM(dotnorm2, nullptr, D::DotNorm2);
964:   VecSetOp_CUPM(getarray, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
965:   VecSetOp_CUPM(getarraywrite, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
966:   VecSetOp_CUPM(restorearraywrite, nullptr, D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);

968:   VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
969:   VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

971:   VecSetOp_CUPM(getarrayandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
972:   VecSetOp_CUPM(restorearrayandmemtype, nullptr, D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);

974:   VecSetOp_CUPM(getarraywriteandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>);
975:   VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });

977:   VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
978:   VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

980:   // set the functions that are always sequential
981:   using VecSeq_T = VecSeq_CUPM<T>;
982:   VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::Scale);
983:   VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::Copy);
984:   VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::Set);
985:   VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::Swap);
986:   VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::AXPY);
987:   VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::AXPBY);
988:   VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::MAXPY);
989:   VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::AYPX);
990:   VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::WAXPY);
991:   VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::AXPBYPCZ);
992:   VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::PointwiseMult);
993:   VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::PointwiseDivide);
994:   VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::SetRandom);
995:   VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::Dot);
996:   VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::TDot);
997:   VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::Norm);
998:   VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::MDot);
999:   VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::Reciprocal);
1000:   VecSetOp_CUPM(shift, nullptr, VecSeq_T::Shift);
1001:   VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1002:   VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1003:   VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ>);
1004:   VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ>);
1005:   VecSetOp_CUPM(sum, nullptr, VecSeq_T::Sum);
1006:   PetscFunctionReturn(PETSC_SUCCESS);
1007: }

1009: // Called from VecGetSubVector()
1010: template <device::cupm::DeviceType T, typename D>
1011: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1012: {
1013:   PetscFunctionBegin;
1014:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1015:   if (host_array) {
1016:     PetscCall(HostAllocateCheck_(dctx, v));
1017:     *host_array = VecIMPLCast(v)->array;
1018:   }
1019:   if (device_array) {
1020:     PetscCall(DeviceAllocateCheck_(dctx, v));
1021:     *device_array = VecCUPMCast(v)->array_d;
1022:   }
1023:   if (mask) *mask = v->offloadmask;
1024:   PetscFunctionReturn(PETSC_SUCCESS);
1025: }

1027: template <device::cupm::DeviceType T, typename D>
1028: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1029: {
1030:   PetscFunctionBegin;
1031:   if (const auto vcu = VecCUPMCast(v)) {
1032:     cupmStream_t stream;
1033:     // clang-format off
1034:     const auto   cntptrs = util::make_array(
1035:       std::ref(vcu->jmap1_d),
1036:       std::ref(vcu->perm1_d),
1037:       std::ref(vcu->imap2_d),
1038:       std::ref(vcu->jmap2_d),
1039:       std::ref(vcu->perm2_d),
1040:       std::ref(vcu->Cperm_d)
1041:     );
1042:     // clang-format on

1044:     PetscCall(GetHandlesFrom_(dctx, &stream));
1045:     for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1046:     for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1047:   }
1048:   PetscFunctionReturn(PETSC_SUCCESS);
1049: }

1051: template <device::cupm::DeviceType T, typename D>
1052: template <std::size_t NCount, std::size_t NScal>
1053: inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1054: {
1055:   PetscFunctionBegin;
1056:   PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1057:   // need to instantiate the private pointer if not already
1058:   PetscCall(VecCUPMAllocateCheck_(v));
1059:   {
1060:     const auto vimpl = VecIMPLCast(v);
1061:     const auto vcu   = VecCUPMCast(v);
1062:     // clang-format off
1063:     const auto cntptrs = util::concat_array(
1064:       util::make_array(
1065:         make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1),
1066:         make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)
1067:       ),
1068:       extra_cntptrs
1069:     );
1070:     // clang-format on
1071:     cupmStream_t stream;

1073:     PetscCall(GetHandlesFrom_(dctx, &stream));
1074:     // allocate
1075:     for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1076:     for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1077:     // copy
1078:     for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1079:     for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1080:   }
1081:   PetscFunctionReturn(PETSC_SUCCESS);
1082: }

1084:   #define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
1085:     PETSC_CUPMOBJECT_HEADER(Tp); \
1086:     using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
1087:     friend name; \
1088:     /* introspection */ \
1089:     using name::VecCUPMCast; \
1090:     using name::VecIMPLCast; \
1091:     using name::VECIMPLCUPM; \
1092:     using name::VECSEQCUPM; \
1093:     using name::VECMPICUPM; \
1094:     using name::VECCUPM; \
1095:     using name::VecView_Debug; \
1096:     /* utility */ \
1097:     using typename name::Vec_CUPM; \
1098:     using name::VecCUPMAllocateCheck_; \
1099:     using name::VecIMPLAllocateCheck_; \
1100:     using name::HostAllocateCheck_; \
1101:     using name::DeviceAllocateCheck_; \
1102:     using name::CopyToDevice_; \
1103:     using name::CopyToHost_; \
1104:     using name::Create; \
1105:     using name::Destroy; \
1106:     using name::GetArray; \
1107:     using name::RestoreArray; \
1108:     using name::GetArrayAndMemtype; \
1109:     using name::RestoreArrayAndMemtype; \
1110:     using name::PlaceArray; \
1111:     using name::ReplaceArray; \
1112:     using name::ResetArray; \
1113:     /* base functions */ \
1114:     using name::Create_CUPMBase; \
1115:     using name::Initialize_CUPMBase; \
1116:     using name::Duplicate_CUPMBase; \
1117:     using name::BindToCPU_CUPMBase; \
1118:     using name::Create_CUPM; \
1119:     using name::DeviceArrayRead; \
1120:     using name::DeviceArrayWrite; \
1121:     using name::DeviceArrayReadWrite; \
1122:     using name::HostArrayRead; \
1123:     using name::HostArrayWrite; \
1124:     using name::HostArrayReadWrite; \
1125:     using name::ResetPreallocationCOO_CUPMBase; \
1126:     using name::SetPreallocationCOO_CUPMBase
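      // Intended use (rough sketch; the real invocations live in the derived-class headers): a
      // derived class such as VecSeq_CUPM re-exports this interface into its own scope with
      //
      //   PETSC_VEC_CUPM_BASE_CLASS_HEADER(base_type, T, VecSeq_CUPM<T>);
      //
      // where the first argument is simply the alias name the derived class uses for this base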

1128: } // namespace impl

1130: } // namespace cupm

1132: } // namespace vec

1134: } // namespace Petsc

1136: #endif // __cplusplus && PetscDefined(HAVE_DEVICE)

1138: #endif // PETSCVECCUPMIMPL_H