Actual source code: veccupmimpl.h
1: #ifndef PETSCVECCUPMIMPL_H
2: #define PETSCVECCUPMIMPL_H
4: #include <petsc/private/vecimpl.h>
5: #include <../src/vec/vec/impls/dvecimpl.h>
7: #if PetscDefined(HAVE_NVSHMEM)
8: PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
9: PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
10: PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
11: PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
12: #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
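// The macro above frees a non-NULL pointer via PetscNvshmemFree_Private() and, on success,
// resets it to PETSC_NULLPTR; a NULL pointer is a no-op yielding PETSC_SUCCESS. A minimal
// usage sketch ('buf' is a hypothetical NVSHMEM allocation):
//
//   PetscScalar *buf = nullptr;
//   PetscCall(PetscNvshmemMalloc(n * sizeof(*buf), (void **)&buf));
//   // ... use buf ...
//   PetscCall(PetscNvshmemFree(buf)); // buf is PETSC_NULLPTR afterwards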
13: PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
14: PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
15: PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *);
16: PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
17: #else
18: #define PetscNvshmemFree(ptr) PETSC_SUCCESS
19: #endif
21: #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
22: #include <petsc/private/deviceimpl.h>
23: #include <petsc/private/cupmobject.hpp>
24: #include <petsc/private/cupmblasinterface.hpp>
26: #include <petsc/private/cpp/functional.hpp>
28: #include <limits> // std::numeric_limits
30: namespace Petsc
31: {
33: namespace vec
34: {
36: namespace cupm
37: {
39: namespace impl
40: {
42: namespace
43: {
45: struct no_op {
46: template <typename... T>
47: constexpr PetscErrorCode operator()(T &&...) const noexcept
48: {
49: return PETSC_SUCCESS;
50: }
51: };
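// no_op is the default SetupFunctionT of Duplicate_CUPMBase() below: a callable that accepts
// any arguments, ignores them, and returns PETSC_SUCCESS. Roughly (illustrative):
//
//   no_op noop;
//   PetscErrorCode ierr = noop(1, 2.0, nullptr); // always PETSC_SUCCESS, arguments ignored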
53: template <typename T>
54: struct CooPair {
55: using value_type = T;
56: using size_type = PetscCount;
58: value_type *&device;
59: value_type *&host;
60: size_type size;
61: };
63: template <typename U>
64: static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
65: {
66: return {device, host, size};
67: }
69: } // anonymous namespace
71: // forward declarations
72: template <device::cupm::DeviceType>
73: class VecSeq_CUPM;
74: template <device::cupm::DeviceType>
75: class VecMPI_CUPM;
77: // ==========================================================================================
78: // Vec_CUPMBase
79: //
80: // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
81: // template parameter it also uses CRTP to be able to use values/calls specific to either
82: // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
83: // ==========================================================================================
84: template <device::cupm::DeviceType T, typename Derived>
85: class Vec_CUPMBase : protected device::cupm::impl::CUPMObject<T> {
86: public:
87: PETSC_CUPMOBJECT_HEADER(T);
89: // ==========================================================================================
90: // Vec_CUPMBase::VectorArray
91: //
92: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
93: // holds the pointer itself, and provides the implicit conversion operator
94: // ==========================================================================================
95: template <PetscMemType, PetscMemoryAccessMode>
96: class VectorArray;
98: protected:
99: static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
100: {
101: const auto pobj = PetscObjectCast(v);
102: const auto vimpl = VecIMPLCast(v);
103: const auto vcu = VecCUPMCast(v);
104: PetscMemType mtype;
105: MPI_Comm comm;
107: PetscFunctionBegin;
110: PetscCall(PetscObjectGetComm(pobj, &comm));
111: PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
112: PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
113: PetscCall(PetscPrintf(comm, "Address: %p\n", v));
114: PetscCall(PetscPrintf(comm, "Size: %" PetscInt_FMT "\n", v->map->n));
115: PetscCall(PetscPrintf(comm, "Offload mask: %s\n", PetscOffloadMaskToString(v->offloadmask)));
116: PetscCall(PetscPrintf(comm, "Host ptr: %p\n", vimpl->array));
117: PetscCall(PetscPrintf(comm, "Device ptr: %p\n", vcu->array_d));
118: PetscCall(PetscPrintf(comm, "Device alloced ptr: %p\n", vcu->array_allocated_d));
119: PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
120: PetscCall(PetscPrintf(comm, "dptr is device mem? %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
121: PetscFunctionReturn(PETSC_SUCCESS);
122: }
124: // Delete the allocated device array if required and replace it with the given array
125: static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
126: // Check whether the host or device impl pointer is allocated and allocate it if it
127: // isn't. CastFunctionType casts the Vec to the required type and returns the pointer
128: template <typename CastFunctionType>
129: static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
130: // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
131: static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
132: // Check the Host part (v->data) is allocated, otherwise allocate it
133: static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
134: // Check the Host array is allocated, otherwise allocate it
135: static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
136: // Check the CUPM array is allocated, otherwise allocate it
137: static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
138: // Copy HTOD, allocating device if necessary
139: static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
140: // Copy DTOH, allocating host if necessary
141: static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;
143: public:
144: struct Vec_CUPM {
145: PetscScalar *array_d; // gpu data
146: PetscScalar *array_allocated_d; // device array allocated (owned) by PETSc; NULL if PETSc does not own array_d
147: PetscBool nvshmem; // is array allocated in nvshmem? It is used to allocate
148: // Mvctx->lvec in nvshmem
150: // COO stuff
151: PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
152: // in COO arrays
153: PetscCount *perm1_d; // [tot1]: permutation array for local entries
154: PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
155: // the vector
156: PetscCount *jmap2_d; // [nnz2+1]
157: PetscCount *perm2_d; // [recvlen]
158: PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
159: // communication
161: // Buffers for remote values in VecSetValuesCOO()
162: PetscScalar *sendbuf_d;
163: PetscScalar *recvbuf_d;
164: };
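// To make the COO metadata above concrete, a rough sketch (ADD_VALUES shown) of how
// VecSetValuesCOO() consumes jmap1/perm1 for the locally owned entries, assuming host mirrors
// of these arrays and coo_v being the user-supplied value array:
//
//   for (PetscInt i = 0; i < m; ++i) {
//     for (PetscCount k = jmap1[i]; k < jmap1[i + 1]; ++k) x[i] += coo_v[perm1[k]];
//   }
//
// imap2/jmap2/perm2 play the analogous role for entries received from other ranks, and
// Cperm_d packs sendbuf_d from the user-supplied values before communication.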
166: // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
167: PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
168: // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
169: template <typename U = Derived>
170: PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
171: // Get the PetscLogEvents for HTOD and DTOH
172: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
173: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
174: // Get the VecTypes
175: PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
176: PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
177: PETSC_NODISCARD static constexpr VecType VECCUPM() noexcept;
179: // Get the VecType of the calling vector
180: template <typename U = Derived>
181: PETSC_NODISCARD static constexpr VecType VECIMPLCUPM() noexcept;
183: // Call the host destroy function, i.e. VecDestroy_Seq()
184: static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
185: // Call the host reset function, i.e. VecResetArray_Seq()
186: static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
187: // ... you get the idea
188: static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
189: // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
190: // along with it if needed
191: static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;
193: // Shorthand for creating VectorArrays. Need functions to create them, otherwise using them
194: // as an unnamed temporary leads to the most vexing parse
195: PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
196: PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
197: PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
198: PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
199: PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
200: PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
202: // ops-table functions
203: static PetscErrorCode Create(Vec) noexcept;
204: static PetscErrorCode Destroy(Vec) noexcept;
205: template <PetscMemType, PetscMemoryAccessMode, bool = false>
206: static PetscErrorCode GetArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
207: template <PetscMemType, PetscMemoryAccessMode, bool = false>
208: static PetscErrorCode GetArray(Vec, PetscScalar **) noexcept;
209: template <PetscMemType, PetscMemoryAccessMode>
210: static PetscErrorCode RestoreArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
211: template <PetscMemType, PetscMemoryAccessMode>
212: static PetscErrorCode RestoreArray(Vec, PetscScalar **) noexcept;
213: template <PetscMemoryAccessMode>
214: static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
215: template <PetscMemoryAccessMode>
216: static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
217: template <PetscMemoryAccessMode>
218: static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
219: template <PetscMemoryAccessMode>
220: static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **) noexcept;
221: template <PetscMemType>
222: static PetscErrorCode ReplaceArray(Vec, const PetscScalar *) noexcept;
223: template <PetscMemType>
224: static PetscErrorCode ResetArray(Vec) noexcept;
225: template <PetscMemType>
226: static PetscErrorCode PlaceArray(Vec, const PetscScalar *) noexcept;
228: // common ops shared between Seq and MPI
229: static PetscErrorCode Create_CUPM(Vec) noexcept;
230: static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
231: static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
232: template <typename SetupFunctionT = no_op>
233: static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
234: static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
235: static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
236: static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
237: template <std::size_t NCount = 0, std::size_t NScal = 0>
238: static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
239: };
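// A rough sketch of how the CRTP wiring described above looks from the derived side
// (simplified; the actual derived classes are forward-declared above and defined elsewhere):
//
//   template <device::cupm::DeviceType T>
//   class VecSeq_CUPM : public Vec_CUPMBase<T, VecSeq_CUPM<T>> {
//     // static hooks the base reaches back into, e.g.:
//     //   static Vec_Seq          *VecIMPLCast_(Vec);
//     //   static constexpr VecType VECIMPLCUPM_();
//     //   static PetscErrorCode    VecDestroy_IMPL_(Vec);
//   };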
241: // ==========================================================================================
242: // Vec_CUPMBase::VectorArray
243: //
244: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
245: // holds the pointer itself and provides the implicit conversion operator.
246: //
247: // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
248: // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
249: // for you
250: // ==========================================================================================
251: template <device::cupm::DeviceType T, typename D>
252: template <PetscMemType MT, PetscMemoryAccessMode MA>
253: class Vec_CUPMBase<T, D>::VectorArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
254: using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;
256: public:
257: VectorArray(PetscDeviceContext, Vec) noexcept;
258: ~VectorArray() noexcept;
260: private:
261: Vec v_ = nullptr;
262: };
264: // ==========================================================================================
265: // Vec_CUPMBase::VectorArray - Public API
266: // ==========================================================================================
268: template <device::cupm::DeviceType T, typename D>
269: template <PetscMemType MT, PetscMemoryAccessMode MA>
270: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::VectorArray(PetscDeviceContext dctx, Vec v) noexcept : base_type{dctx}, v_{v}
271: {
272: PetscFunctionBegin;
273: PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template GetArray<MT, MA, true>(v, &this->ptr_, dctx));
274: PetscFunctionReturnVoid();
275: }
277: template <device::cupm::DeviceType T, typename D>
278: template <PetscMemType MT, PetscMemoryAccessMode MA>
279: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::~VectorArray() noexcept
280: {
281: PetscFunctionBegin;
282: PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template RestoreArray<MT, MA>(v_, &this->ptr_, this->dctx_));
283: PetscFunctionReturnVoid();
284: }
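// A typical usage sketch (illustrative) of these RAII helpers as created via
// DeviceArrayRead()/DeviceArrayWrite() and friends:
//
//   {
//     const auto xarr = DeviceArrayRead(dctx, xin);   // ~ VecCUPMGetArrayRead()
//     const auto yarr = DeviceArrayWrite(dctx, yout); // ~ VecCUPMGetArrayWrite()
//     // xarr/yarr convert implicitly to (const-qualified, as appropriate) PetscScalar *
//     // and can be handed straight to kernels or cuBLAS/hipBLAS calls
//   } // both arrays are restored automatically at scope exit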
286: // ==========================================================================================
287: // Vec_CUPMBase - Protected API
288: // ==========================================================================================
290: template <device::cupm::DeviceType T, typename D>
291: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
292: {
293: auto &device_array = VecCUPMCast(v)->array_allocated_d;
295: PetscFunctionBegin;
296: if (device_array) {
297: if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
298: PetscCall(PetscNvshmemFree(device_array));
299: } else {
300: cupmStream_t stream;
302: PetscCall(GetHandlesFrom_(dctx, &stream));
303: PetscCallCUPM(cupmFreeAsync(device_array, stream));
304: }
305: }
306: device_array = new_value;
307: PetscFunctionReturn(PETSC_SUCCESS);
308: }
310: namespace
311: {
313: inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
314: {
315: auto mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
316: PetscBool flg;
318: PetscFunctionBegin;
319: PetscObjectOptionsBegin(PetscObjectCast(v));
320: PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
321: if (flg) v->minimum_bytes_pinned_memory = mem;
322: PetscOptionsEnd();
323: PetscFunctionReturn(PETSC_SUCCESS);
324: }
326: } // anonymous namespace
328: template <device::cupm::DeviceType T, typename D>
329: template <typename CastFunctionType>
330: inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
331: {
332: PetscFunctionBegin;
333: if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
334: // do the check here so we don't have to do it in every function
335: PetscCall(checkCupmBlasIntCast(v->map->n));
336: {
337: auto impl = cast(v);
339: PetscCall(PetscNew(&impl));
340: dest = impl;
341: }
342: PetscFunctionReturn(PETSC_SUCCESS);
343: }
345: template <device::cupm::DeviceType T, typename D>
346: inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
347: {
348: PetscFunctionBegin;
349: PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
350: PetscFunctionReturn(PETSC_SUCCESS);
351: }
353: // allocate the Vec_CUPM struct. this is normally done through DeviceAllocateCheck_(), but in
354: // certain circumstances (such as when the user places the device array) we do not want to do
355: // the full DeviceAllocateCheck_() as it also allocates the array
356: template <device::cupm::DeviceType T, typename D>
357: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
358: {
359: PetscFunctionBegin;
360: PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
361: PetscFunctionReturn(PETSC_SUCCESS);
362: }
364: template <device::cupm::DeviceType T, typename D>
365: inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
366: {
367: PetscFunctionBegin;
368: PetscCall(VecIMPLAllocateCheck_(v));
369: if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
370: else {
371: PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
372: {
373: const auto n = v->map->n;
374: const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);
376: v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
377: PetscCall(PetscMalloc1(n, &alloc));
378: }
379: if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
380: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
381: }
382: PetscFunctionReturn(PETSC_SUCCESS);
383: }
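// For example (illustrative), running an application with
//
//   ./app -vec_pinned_memory_min 0
//
// makes every nonempty host allocation in HostAllocateCheck_() use pinned (page-locked)
// memory, since the allocation size is then always greater than the minimum, while a very
// large value effectively disables pinning.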
385: template <device::cupm::DeviceType T, typename D>
386: inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
387: {
388: PetscFunctionBegin;
389: PetscCall(VecCUPMAllocateCheck_(v));
390: if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
391: else {
392: const auto n = v->map->n;
393: auto &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
394: cupmStream_t stream;
396: PetscCall(GetHandlesFrom_(dctx, &stream));
397: PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
398: alloc = array_allocated_d;
399: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
400: const auto vimp = VecIMPLCast(v);
401: v->offloadmask = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
402: }
403: }
404: PetscFunctionReturn(PETSC_SUCCESS);
405: }
407: template <device::cupm::DeviceType T, typename D>
408: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
409: {
410: PetscFunctionBegin;
411: PetscCall(DeviceAllocateCheck_(dctx, v));
412: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
413: cupmStream_t stream;
415: v->offloadmask = PETSC_OFFLOAD_BOTH;
416: PetscCall(GetHandlesFrom_(dctx, &stream));
417: PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
418: PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
419: PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
420: }
421: PetscFunctionReturn(PETSC_SUCCESS);
422: }
424: template <device::cupm::DeviceType T, typename D>
425: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
426: {
427: PetscFunctionBegin;
428: PetscCall(HostAllocateCheck_(dctx, v));
429: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
430: cupmStream_t stream;
432: v->offloadmask = PETSC_OFFLOAD_BOTH;
433: PetscCall(GetHandlesFrom_(dctx, &stream));
434: PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
435: PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
436: PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
437: }
438: PetscFunctionReturn(PETSC_SUCCESS);
439: }
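// Taken together with the allocate-check helpers, CopyToDevice_()/CopyToHost_() implement the
// usual offload-mask transitions; roughly (sketch, not exhaustive):
//
//   PETSC_OFFLOAD_UNALLOCATED --allocate-->      PETSC_OFFLOAD_CPU or PETSC_OFFLOAD_GPU
//   PETSC_OFFLOAD_CPU         --CopyToDevice_--> PETSC_OFFLOAD_BOTH
//   PETSC_OFFLOAD_GPU         --CopyToHost_-->   PETSC_OFFLOAD_BOTH
//
// while write accesses through GetArray()/RestoreArray() collapse the mask back to
// PETSC_OFFLOAD_CPU or PETSC_OFFLOAD_GPU.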
441: // ==========================================================================================
442: // Vec_CUPMBase - Public API
443: // ==========================================================================================
445: template <device::cupm::DeviceType T, typename D>
446: inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
447: {
448: return static_cast<Vec_CUPM *>(v->spptr);
449: }
451: // This is a trick to get around the fact that in CRTP the derived class is not yet fully
452: // defined because Base<Derived> must necessarily be instantiated before Derived is
453: // complete. By using a dummy template parameter we make the type "dependent" and so will
454: // only be determined when the derived class is instantiated (and therefore fully defined)
455: template <device::cupm::DeviceType T, typename D>
456: template <typename U>
457: inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
458: {
459: return U::VecIMPLCast_(v);
460: }
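// The same trick in miniature (illustrative only, outside of PETSc):
//
//   template <typename Derived>
//   struct Base {
//     template <typename U = Derived>
//     static auto Cast() -> decltype(U::Cast_()) { return U::Cast_(); } // U resolved late
//   };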
462: template <device::cupm::DeviceType T, typename D>
463: inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
464: {
465: return D::VecDestroy_IMPL_(v);
466: }
468: template <device::cupm::DeviceType T, typename D>
469: inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
470: {
471: return D::VecResetArray_IMPL_(v);
472: }
474: template <device::cupm::DeviceType T, typename D>
475: inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
476: {
477: return D::VecPlaceArray_IMPL_(v, a);
478: }
480: template <device::cupm::DeviceType T, typename D>
481: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
482: {
483: return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
484: }
486: template <device::cupm::DeviceType T, typename D>
487: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
488: {
489: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
490: }
492: template <device::cupm::DeviceType T, typename D>
493: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
494: {
495: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
496: }
498: template <device::cupm::DeviceType T, typename D>
499: inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
500: {
501: return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
502: }
504: template <device::cupm::DeviceType T, typename D>
505: inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
506: {
507: return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
508: }
510: template <device::cupm::DeviceType T, typename D>
511: inline constexpr VecType Vec_CUPMBase<T, D>::VECCUPM() noexcept
512: {
513: return T == device::cupm::DeviceType::CUDA ? VECCUDA : VECHIP;
514: }
516: template <device::cupm::DeviceType T, typename D>
517: template <typename U>
518: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
519: {
520: return U::VECIMPLCUPM_();
521: }
523: // private version that takes a PetscDeviceContext, called by the public variant
524: template <device::cupm::DeviceType T, typename D>
525: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
526: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
527: {
528: constexpr auto hostmem = PetscMemTypeHost(mtype);
529: const auto oldmask = v->offloadmask;
530: auto &mask = v->offloadmask;
531: auto should_sync = false;
533: PetscFunctionBegin;
534: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
535: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
536: if (PetscMemoryAccessRead(access)) {
537: // READ or READ_WRITE
538: if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
539: // if we move the data we should set the flag to synchronize later on
540: should_sync = true;
541: }
542: PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
543: } else {
544: // WRITE only
545: PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
546: }
547: *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
548: // if unallocated previously we should zero things out if we intend to read
549: if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
550: const auto n = v->map->n;
552: if (hostmem) {
553: PetscCall(PetscArrayzero(*a, n));
554: } else {
555: cupmStream_t stream;
557: PetscCall(GetHandlesFrom_(dctx, &stream));
558: PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
559: should_sync = true;
560: }
561: }
562: // update the offloadmask if we intend to write, since we assume it is immediately modified
563: if (PetscMemoryAccessWrite(access)) {
564: PetscCall(VecSetErrorIfLocked(v, 1));
565: // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
566: // is immediately modified
567: mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
568: }
569: // if we are on a globally blocking stream and we have MOVED data then we should synchronize,
570: // since even doing async calls on the NULL stream is not synchronous
571: if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
572: PetscFunctionReturn(PETSC_SUCCESS);
573: }
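// In summary, the combinations handled above are roughly:
//
//   access includes READ  -> copy valid data to the requested memory (may require a sync),
//                            zeroing it first if the vector was never allocated
//   access is WRITE only  -> merely ensure the requested memory is allocated
//   access includes WRITE -> fail if the vector is locked, then mark the offload mask as
//                            PETSC_OFFLOAD_CPU (host) or PETSC_OFFLOAD_GPU (device)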
575: // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
576: template <device::cupm::DeviceType T, typename D>
577: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
578: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a) noexcept
579: {
580: PetscDeviceContext dctx;
582: PetscFunctionBegin;
583: PetscCall(GetHandles_(&dctx));
584: PetscCall(D::template GetArray<mtype, access, force>(v, a, dctx));
585: PetscFunctionReturn(PETSC_SUCCESS);
586: }
588: // private version that takes a PetscDeviceContext, called by the public variant
589: template <device::cupm::DeviceType T, typename D>
590: template <PetscMemType mtype, PetscMemoryAccessMode access>
591: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
592: {
593: PetscFunctionBegin;
594: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
595: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
596: if (PetscMemoryAccessWrite(access)) {
597: // WRITE or READ_WRITE
598: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
599: v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
600: }
601: if (a) {
602: PetscCall(CheckPointerMatchesMemType_(*a, mtype));
603: *a = nullptr;
604: }
605: PetscFunctionReturn(PETSC_SUCCESS);
606: }
608: // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
609: template <device::cupm::DeviceType T, typename D>
610: template <PetscMemType mtype, PetscMemoryAccessMode access>
611: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a) noexcept
612: {
613: PetscDeviceContext dctx;
615: PetscFunctionBegin;
616: PetscCall(GetHandles_(&dctx));
617: PetscCall(D::template RestoreArray<mtype, access>(v, a, dctx));
618: PetscFunctionReturn(PETSC_SUCCESS);
619: }
621: template <device::cupm::DeviceType T, typename D>
622: template <PetscMemoryAccessMode access>
623: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
624: {
625: PetscFunctionBegin;
626: PetscCall(D::template GetArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
627: if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
628: PetscFunctionReturn(PETSC_SUCCESS);
629: }
631: // v->ops->getarrayandmemtype
632: template <device::cupm::DeviceType T, typename D>
633: template <PetscMemoryAccessMode access>
634: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
635: {
636: PetscDeviceContext dctx;
638: PetscFunctionBegin;
639: PetscCall(GetHandles_(&dctx));
640: PetscCall(D::template GetArrayAndMemtype<access>(v, a, mtype, dctx));
641: PetscFunctionReturn(PETSC_SUCCESS);
642: }
644: template <device::cupm::DeviceType T, typename D>
645: template <PetscMemoryAccessMode access>
646: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
647: {
648: PetscFunctionBegin;
649: PetscCall(D::template RestoreArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
650: PetscFunctionReturn(PETSC_SUCCESS);
651: }
653: // v->ops->restorearrayandmemtype
654: template <device::cupm::DeviceType T, typename D>
655: template <PetscMemoryAccessMode access>
656: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a) noexcept
657: {
658: PetscDeviceContext dctx;
660: PetscFunctionBegin;
661: PetscCall(GetHandles_(&dctx));
662: PetscCall(D::template RestoreArrayAndMemtype<access>(v, a, dctx));
663: PetscFunctionReturn(PETSC_SUCCESS);
664: }
666: // v->ops->placearray or VecCUPMPlaceArray()
667: template <device::cupm::DeviceType T, typename D>
668: template <PetscMemType mtype>
669: inline PetscErrorCode Vec_CUPMBase<T, D>::PlaceArray(Vec v, const PetscScalar *a) noexcept
670: {
671: PetscDeviceContext dctx;
673: PetscFunctionBegin;
674: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
675: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
676: PetscCall(CheckPointerMatchesMemType_(a, mtype));
677: PetscCall(GetHandles_(&dctx));
678: if (PetscMemTypeHost(mtype)) {
679: PetscCall(CopyToHost_(dctx, v));
680: PetscCall(VecPlaceArray_IMPL(v, a));
681: v->offloadmask = PETSC_OFFLOAD_CPU;
682: } else {
683: PetscCall(VecIMPLAllocateCheck_(v));
684: {
685: auto &backup_array = VecIMPLCast(v)->unplacedarray;
687: PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
688: PetscCall(CopyToDevice_(dctx, v));
689: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
690: backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
691: // only update the offload mask if we actually assign a pointer
692: if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
693: }
694: }
695: PetscFunctionReturn(PETSC_SUCCESS);
696: }
698: // v->ops->replacearray or VecCUPMReplaceArray()
699: template <device::cupm::DeviceType T, typename D>
700: template <PetscMemType mtype>
701: inline PetscErrorCode Vec_CUPMBase<T, D>::ReplaceArray(Vec v, const PetscScalar *a) noexcept
702: {
703: const auto aptr = const_cast<PetscScalar *>(a);
704: PetscDeviceContext dctx;
706: PetscFunctionBegin;
707: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
708: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
709: PetscCall(CheckPointerMatchesMemType_(a, mtype));
710: PetscCall(GetHandles_(&dctx));
711: if (PetscMemTypeHost(mtype)) {
712: PetscCall(VecIMPLAllocateCheck_(v));
713: {
714: const auto vimpl = VecIMPLCast(v);
715: auto &host_array = vimpl->array_allocated;
717: // make sure the user's array has the latest values.
718: // REVIEW ME: why? we're about to free it
719: if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
720: if (host_array) {
721: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
723: PetscCall(PetscFree(host_array));
724: }
725: host_array = aptr;
726: vimpl->array = host_array;
727: v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
728: v->offloadmask = PETSC_OFFLOAD_CPU;
729: }
730: } else {
731: PetscCall(VecCUPMAllocateCheck_(v));
732: {
733: const auto vcu = VecCUPMCast(v);
735: PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
736: // don't update the offloadmask if the placed pointer is NULL
737: vcu->array_d = vcu->array_allocated_d /* = aptr */;
738: if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
739: }
740: }
741: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
742: PetscFunctionReturn(PETSC_SUCCESS);
743: }
745: // v->ops->resetarray or VecCUPMResetArray()
746: template <device::cupm::DeviceType T, typename D>
747: template <PetscMemType mtype>
748: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetArray(Vec v) noexcept
749: {
750: PetscDeviceContext dctx;
752: PetscFunctionBegin;
753: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
754: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
755: PetscCall(GetHandles_(&dctx));
756: // REVIEW ME:
757: // this is wildly inefficient but must be done if we assume that the placed array must have
758: // correct values
759: if (PetscMemTypeHost(mtype)) {
760: PetscCall(CopyToHost_(dctx, v));
761: PetscCall(VecResetArray_IMPL(v));
762: v->offloadmask = PETSC_OFFLOAD_CPU;
763: } else {
764: PetscCall(VecIMPLAllocateCheck_(v));
765: PetscCall(VecCUPMAllocateCheck_(v));
766: {
767: const auto vcu = VecCUPMCast(v);
768: const auto vimpl = VecIMPLCast(v);
769: auto &host_array = vimpl->unplacedarray;
771: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
772: PetscCall(CopyToDevice_(dctx, v));
773: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
774: // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
775: // otherwise check if the host has a valid pointer. If neither, then we are not
776: // allocated.
777: vcu->array_d = host_array;
778: if (host_array) {
779: host_array = nullptr;
780: v->offloadmask = PETSC_OFFLOAD_GPU;
781: } else if (vimpl->array) {
782: v->offloadmask = PETSC_OFFLOAD_CPU;
783: } else {
784: v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
785: }
786: }
787: }
788: PetscFunctionReturn(PETSC_SUCCESS);
789: }
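// A usage sketch of the place/reset pair from user code (CUDA spelling shown, the HIP
// variants are analogous; d_buf is a caller-owned device buffer):
//
//   PetscScalar *d_buf = /* device allocation owned by the caller */;
//   PetscCall(VecCUDAPlaceArray(x, d_buf)); // x now wraps d_buf
//   // ... operate on x ...
//   PetscCall(VecCUDAResetArray(x));        // restore x's original device array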
791: // v->ops->create
792: template <device::cupm::DeviceType T, typename D>
793: inline PetscErrorCode Vec_CUPMBase<T, D>::Create(Vec v) noexcept
794: {
795: PetscBool alloc_missing;
796: PetscDeviceContext dctx;
798: PetscFunctionBegin;
799: PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
800: PetscCall(GetHandles_(&dctx));
801: PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
802: PetscFunctionReturn(PETSC_SUCCESS);
803: }
805: // v->ops->destroy
806: template <device::cupm::DeviceType T, typename D>
807: inline PetscErrorCode Vec_CUPMBase<T, D>::Destroy(Vec v) noexcept
808: {
809: PetscFunctionBegin;
810: if (const auto vcu = VecCUPMCast(v)) {
811: PetscDeviceContext dctx;
813: PetscCall(GetHandles_(&dctx));
814: PetscCall(ResetAllocatedDevicePtr_(dctx, v));
815: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
816: PetscCall(PetscFree(v->spptr));
817: }
818: PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
819: if (const auto vimpl = VecIMPLCast(v)) {
820: if (auto &array_allocated = vimpl->array_allocated) {
821: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
823: // do this ourselves since we may want to use the cupm functions
824: PetscCall(PetscFree(array_allocated));
825: }
826: }
827: v->pinned_memory = PETSC_FALSE;
828: PetscCall(VecDestroy_IMPL(v));
829: PetscFunctionReturn(PETSC_SUCCESS);
830: }
832: // ================================================================================== //
833: // Common core between Seq and MPI //
835: // VecCreate_CUPM()
836: template <device::cupm::DeviceType T, typename D>
837: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
838: {
839: PetscMPIInt size;
841: PetscFunctionBegin;
842: PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
843: PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
844: PetscFunctionReturn(PETSC_SUCCESS);
845: }
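// From user code the usual way to reach VecCreate_CUPM() is the standard creation sequence
// (illustrative; CUDA spelling shown, HIP is analogous):
//
//   Vec x;
//   PetscCall(VecCreate(comm, &x));
//   PetscCall(VecSetSizes(x, n, PETSC_DECIDE));
//   PetscCall(VecSetType(x, VECCUDA)); // dispatched to VECSEQCUDA or VECMPICUDA by comm size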
847: // VecCreateCUPM()
848: template <device::cupm::DeviceType T, typename D>
849: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
850: {
851: PetscFunctionBegin;
852: PetscCall(VecCreate(comm, v));
853: if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
854: PetscCall(VecSetSizes(*v, n, N));
855: if (bs) PetscCall(VecSetBlockSize(*v, bs));
856: if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
857: PetscFunctionReturn(PETSC_SUCCESS);
858: }
860: // VecCreateIMPL_CUPM(), called through v->ops->create
861: template <device::cupm::DeviceType T, typename D>
862: inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
863: {
864: PetscFunctionBegin;
865: // REVIEW ME: perhaps not needed
866: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
867: PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
868: PetscCall(D::BindToCPU(v, PETSC_FALSE));
869: if (device_array) {
870: PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
871: PetscCall(VecCUPMAllocateCheck_(v));
872: VecCUPMCast(v)->array_d = device_array;
873: }
874: if (host_array) {
875: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
876: VecIMPLCast(v)->array = host_array;
877: }
878: if (allocate_missing) {
879: PetscCall(DeviceAllocateCheck_(dctx, v));
880: PetscCall(HostAllocateCheck_(dctx, v));
881: // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
882: // set() for reference
883: // calls device-version
884: PetscCall(VecSet(v, 0));
885: // zero the host while device is underway
886: PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
887: v->offloadmask = PETSC_OFFLOAD_BOTH;
888: } else {
889: if (host_array) {
890: v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
891: } else {
892: v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
893: }
894: }
895: PetscFunctionReturn(PETSC_SUCCESS);
896: }
898: // v->ops->duplicate
899: template <device::cupm::DeviceType T, typename D>
900: template <typename SetupFunctionT>
901: inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
902: {
903: // if the derived setup is the default no_op then we should call VecSetType()
904: constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
905: const auto vobj = PetscObjectCast(v);
906: const auto map = v->map;
907: PetscInt bs;
909: PetscFunctionBegin;
910: PetscCall(VecGetBlockSize(v, &bs));
911: PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
912: // Derived class can set up the remainder of the data structures here
913: PetscCall(DerivedCreateIMPLCUPM_Async(*y));
914: // If the other vector is bound to CPU then the memcpy of the ops struct will give the
915: // duplicated vector the host "getarray" function which does not lazily allocate the array
916: // (as it is assumed to always exist). So we force allocation here, before we overwrite the
917: // ops
918: if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
919: // in case the user has done some VecSetOps() tomfoolery
920: PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
921: {
922: const auto yobj = PetscObjectCast(*y);
924: PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
925: PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
926: }
927: (*y)->stash.donotstash = v->stash.donotstash;
928: (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
929: (*y)->map->bs = std::abs(v->map->bs);
930: (*y)->bstash.bs = v->bstash.bs;
931: PetscFunctionReturn(PETSC_SUCCESS);
932: }
934: #define VecSetOp_CUPM(op_name, op_host, ...) \
935: do { \
936: if (usehost) { \
937: v->ops->op_name = op_host; \
938: } else { \
939: v->ops->op_name = __VA_ARGS__; \
940: } \
941: } while (0)
943: // v->ops->bindtocpu
944: template <device::cupm::DeviceType T, typename D>
945: inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
946: {
947: PetscFunctionBegin;
948: v->boundtocpu = usehost;
949: if (usehost) PetscCall(CopyToHost_(dctx, v));
950: PetscCall(PetscStrFreeAllocpy(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));
952: // set the base functions that are guaranteed to be the same for both
953: v->ops->duplicate = D::Duplicate;
954: v->ops->create = D::Create;
955: v->ops->destroy = D::Destroy;
956: v->ops->bindtocpu = D::BindToCPU;
957: // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
958: // why, and I don't know how, but it is IMPERATIVE these are set as such!
959: v->ops->replacearray = D::template ReplaceArray<PETSC_MEMTYPE_HOST>;
960: v->ops->restorearray = D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;
962: // set device-only common functions
963: VecSetOp_CUPM(dotnorm2, nullptr, D::DotNorm2);
964: VecSetOp_CUPM(getarray, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
965: VecSetOp_CUPM(getarraywrite, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
966: VecSetOp_CUPM(restorearraywrite, nullptr, D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
968: VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
969: VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
971: VecSetOp_CUPM(getarrayandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
972: VecSetOp_CUPM(restorearrayandmemtype, nullptr, D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
974: VecSetOp_CUPM(getarraywriteandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>);
975: VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });
977: VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
978: VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
980: // set the functions that are always sequential
981: using VecSeq_T = VecSeq_CUPM<T>;
982: VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::Scale);
983: VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::Copy);
984: VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::Set);
985: VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::Swap);
986: VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::AXPY);
987: VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::AXPBY);
988: VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::MAXPY);
989: VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::AYPX);
990: VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::WAXPY);
991: VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::AXPBYPCZ);
992: VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::PointwiseMult);
993: VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::PointwiseDivide);
994: VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::SetRandom);
995: VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::Dot);
996: VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::TDot);
997: VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::Norm);
998: VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::MDot);
999: VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::Reciprocal);
1000: VecSetOp_CUPM(shift, nullptr, VecSeq_T::Shift);
1001: VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1002: VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1003: VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ>);
1004: VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ>);
1005: VecSetOp_CUPM(sum, nullptr, VecSeq_T::Sum);
1006: PetscFunctionReturn(PETSC_SUCCESS);
1007: }
1009: // Called from VecGetSubVector()
1010: template <device::cupm::DeviceType T, typename D>
1011: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1012: {
1013: PetscFunctionBegin;
1014: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1015: if (host_array) {
1016: PetscCall(HostAllocateCheck_(dctx, v));
1017: *host_array = VecIMPLCast(v)->array;
1018: }
1019: if (device_array) {
1020: PetscCall(DeviceAllocateCheck_(dctx, v));
1021: *device_array = VecCUPMCast(v)->array_d;
1022: }
1023: if (mask) *mask = v->offloadmask;
1024: PetscFunctionReturn(PETSC_SUCCESS);
1025: }
1027: template <device::cupm::DeviceType T, typename D>
1028: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1029: {
1030: PetscFunctionBegin;
1031: if (const auto vcu = VecCUPMCast(v)) {
1032: cupmStream_t stream;
1033: // clang-format off
1034: const auto cntptrs = util::make_array(
1035: std::ref(vcu->jmap1_d),
1036: std::ref(vcu->perm1_d),
1037: std::ref(vcu->imap2_d),
1038: std::ref(vcu->jmap2_d),
1039: std::ref(vcu->perm2_d),
1040: std::ref(vcu->Cperm_d)
1041: );
1042: // clang-format on
1044: PetscCall(GetHandlesFrom_(dctx, &stream));
1045: for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1046: for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1047: }
1048: PetscFunctionReturn(PETSC_SUCCESS);
1049: }
1051: template <device::cupm::DeviceType T, typename D>
1052: template <std::size_t NCount, std::size_t NScal>
1053: inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1054: {
1055: PetscFunctionBegin;
1056: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1057: // need to instantiate the private pointer if not already
1058: PetscCall(VecCUPMAllocateCheck_(v));
1059: {
1060: const auto vimpl = VecIMPLCast(v);
1061: const auto vcu = VecCUPMCast(v);
1062: // clang-format off
1063: const auto cntptrs = util::concat_array(
1064: util::make_array(
1065: make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1),
1066: make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)
1067: ),
1068: extra_cntptrs
1069: );
1070: // clang-format on
1071: cupmStream_t stream;
1073: PetscCall(GetHandlesFrom_(dctx, &stream));
1074: // allocate
1075: for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1076: for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1077: // copy
1078: for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1079: for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1080: }
1081: PetscFunctionReturn(PETSC_SUCCESS);
1082: }
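// A sketch of how a derived class might forward its additional COO arrays here (illustrative;
// the host-side member names are assumptions mirroring the device members of Vec_CUPM):
//
//   PetscCall(SetPreallocationCOO_CUPMBase(
//     v, ncoo, coo_i, dctx,
//     util::make_array(make_coo_pair(vcu->imap2_d, vimpl->imap2, nnz2),
//                      make_coo_pair(vcu->jmap2_d, vimpl->jmap2, nnz2 + 1)),
//     util::make_array(make_coo_pair(vcu->sendbuf_d, vimpl->sendbuf, sendlen))));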
1084: #define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
1085: PETSC_CUPMOBJECT_HEADER(Tp); \
1086: using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
1087: friend name; \
1088: /* introspection */ \
1089: using name::VecCUPMCast; \
1090: using name::VecIMPLCast; \
1091: using name::VECIMPLCUPM; \
1092: using name::VECSEQCUPM; \
1093: using name::VECMPICUPM; \
1094: using name::VECCUPM; \
1095: using name::VecView_Debug; \
1096: /* utility */ \
1097: using typename name::Vec_CUPM; \
1098: using name::VecCUPMAllocateCheck_; \
1099: using name::VecIMPLAllocateCheck_; \
1100: using name::HostAllocateCheck_; \
1101: using name::DeviceAllocateCheck_; \
1102: using name::CopyToDevice_; \
1103: using name::CopyToHost_; \
1104: using name::Create; \
1105: using name::Destroy; \
1106: using name::GetArray; \
1107: using name::RestoreArray; \
1108: using name::GetArrayAndMemtype; \
1109: using name::RestoreArrayAndMemtype; \
1110: using name::PlaceArray; \
1111: using name::ReplaceArray; \
1112: using name::ResetArray; \
1113: /* base functions */ \
1114: using name::Create_CUPMBase; \
1115: using name::Initialize_CUPMBase; \
1116: using name::Duplicate_CUPMBase; \
1117: using name::BindToCPU_CUPMBase; \
1118: using name::Create_CUPM; \
1119: using name::DeviceArrayRead; \
1120: using name::DeviceArrayWrite; \
1121: using name::DeviceArrayReadWrite; \
1122: using name::HostArrayRead; \
1123: using name::HostArrayWrite; \
1124: using name::HostArrayReadWrite; \
1125: using name::ResetPreallocationCOO_CUPMBase; \
1126: using name::SetPreallocationCOO_CUPMBase
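// A derived class is then expected to pull the base API into scope with something like
// (illustrative):
//
//   template <device::cupm::DeviceType T>
//   class VecMPI_CUPM : public Vec_CUPMBase<T, VecMPI_CUPM<T>> {
//   public:
//     PETSC_VEC_CUPM_BASE_CLASS_HEADER(base_type, T, VecMPI_CUPM<T>);
//     // ...
//   };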
1128: } // namespace impl
1130: } // namespace cupm
1132: } // namespace vec
1134: } // namespace Petsc
1136: #endif // __cplusplus && PetscDefined(HAVE_DEVICE)
1138: #endif // PETSCVECCUPMIMPL_H