Actual source code: mpiaijhipsparse.hip.cpp

  1: /* Portions of this code are under:
  2:    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  3: */
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
  6: #include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
  7: #include <thrust/advance.h>
  8: #include <thrust/partition.h>
  9: #include <thrust/sort.h>
 10: #include <thrust/unique.h>
 11: #include <petscsf.h>

 13: struct VecHIPEquals {
 14:   template <typename Tuple>
 15:   __host__ __device__ void operator()(Tuple t)
 16:   {
 17:     thrust::get<1>(t) = thrust::get<0>(t);
 18:   }
 19: };

 21: static PetscErrorCode MatResetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat)
 22: {
 23:   auto *aij             = static_cast<Mat_MPIAIJ *>(mat->data);
 24:   auto *hipsparseStruct = static_cast<Mat_MPIAIJHIPSPARSE *>(aij->spptr);

 26:   PetscFunctionBegin;
 27:   if (!hipsparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
 28:   if (hipsparseStruct->use_extended_coo) {
 29:     PetscCallHIP(hipFree(hipsparseStruct->Ajmap1_d));
 30:     PetscCallHIP(hipFree(hipsparseStruct->Aperm1_d));
 31:     PetscCallHIP(hipFree(hipsparseStruct->Bjmap1_d));
 32:     PetscCallHIP(hipFree(hipsparseStruct->Bperm1_d));
 33:     PetscCallHIP(hipFree(hipsparseStruct->Aimap2_d));
 34:     PetscCallHIP(hipFree(hipsparseStruct->Ajmap2_d));
 35:     PetscCallHIP(hipFree(hipsparseStruct->Aperm2_d));
 36:     PetscCallHIP(hipFree(hipsparseStruct->Bimap2_d));
 37:     PetscCallHIP(hipFree(hipsparseStruct->Bjmap2_d));
 38:     PetscCallHIP(hipFree(hipsparseStruct->Bperm2_d));
 39:     PetscCallHIP(hipFree(hipsparseStruct->Cperm1_d));
 40:     PetscCallHIP(hipFree(hipsparseStruct->sendbuf_d));
 41:     PetscCallHIP(hipFree(hipsparseStruct->recvbuf_d));
 42:   }
 43:   hipsparseStruct->use_extended_coo = PETSC_FALSE;
 44:   delete hipsparseStruct->coo_p;
 45:   delete hipsparseStruct->coo_pw;
 46:   hipsparseStruct->coo_p  = nullptr;
 47:   hipsparseStruct->coo_pw = nullptr;
 48:   PetscFunctionReturn(PETSC_SUCCESS);
 49: }

 51: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
 52: {
 53:   Mat_MPIAIJ          *a    = (Mat_MPIAIJ *)A->data;
 54:   Mat_MPIAIJHIPSPARSE *cusp = (Mat_MPIAIJHIPSPARSE *)a->spptr;
 55:   PetscInt             n    = cusp->coo_nd + cusp->coo_no;

 57:   PetscFunctionBegin;
 58:   if (cusp->coo_p && v) {
 59:     thrust::device_ptr<const PetscScalar> d_v;
 60:     THRUSTARRAY                          *w = NULL;

 62:     if (isHipMem(v)) {
 63:       d_v = thrust::device_pointer_cast(v);
 64:     } else {
 65:       w = new THRUSTARRAY(n);
 66:       w->assign(v, v + n);
 67:       PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
 68:       d_v = w->data();
 69:     }

 71:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
 72:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
 73:     PetscCall(PetscLogGpuTimeBegin());
 74:     thrust::for_each(zibit, zieit, VecHIPEquals());
 75:     PetscCall(PetscLogGpuTimeEnd());
 76:     delete w;
 77:     PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
 78:     PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
 79:   } else {
 80:     PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->A, v, imode));
 81:     PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(a->B, v ? v + cusp->coo_nd : nullptr, imode));
 82:   }
 83:   PetscFunctionReturn(PETSC_SUCCESS);
 84: }
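
/* The zip/permutation-iterator loop above is a device-side gather,
   coo_pw[i] = v[coo_p[i]]. A minimal standalone sketch of the same pattern,
   assuming thrust/gather.h is included and d_v, n are as in the routine above:

     THRUSTARRAY w(n);
     // w[i] = d_v[coo_p[i]] for all i
     thrust::gather(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), d_v, w.begin());
*/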

 86: template <typename Tuple>
 87: struct IsNotOffDiagT {
 88:   PetscInt _cstart, _cend;

 90:   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 91:   __host__ __device__ bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
 92: };

 94: struct IsOffDiag {
 95:   PetscInt _cstart, _cend;

 97:   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 98:   __host__ __device__ bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
 99: };

101: struct GlobToLoc {
102:   PetscInt _start;

104:   GlobToLoc(PetscInt start) : _start(start) { }
105:   __host__ __device__ PetscInt operator()(const PetscInt &c) { return c - _start; }
106: };

108: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
109: {
110:   Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
111:   Mat_MPIAIJHIPSPARSE   *cusp = (Mat_MPIAIJHIPSPARSE *)b->spptr;
112:   PetscInt               N, *jj;
113:   size_t                 noff = 0;
114:   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
115:   THRUSTINTARRAY         d_j(n);
116:   ISLocalToGlobalMapping l2g;

118:   PetscFunctionBegin;
119:   PetscCall(MatDestroy(&b->A));
120:   PetscCall(MatDestroy(&b->B));

122:   PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
123:   d_i.assign(coo_i, coo_i + n);
124:   d_j.assign(coo_j, coo_j + n);
125:   delete cusp->coo_p;
126:   delete cusp->coo_pw;
127:   cusp->coo_p  = NULL;
128:   cusp->coo_pw = NULL;
129:   PetscCall(PetscLogGpuTimeBegin());
130:   auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
131:   auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
132:   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
133:     cusp->coo_p  = new THRUSTINTARRAY(n);
134:     cusp->coo_pw = new THRUSTARRAY(n);
135:     thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
136:     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
137:     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
138:     auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
139:     firstoffd  = mzipp.get_iterator_tuple().get<1>();
140:   }
141:   cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
142:   cusp->coo_no = thrust::distance(firstoffd, d_j.end());

144:   /* from global to local */
145:   thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
146:   thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
147:   PetscCall(PetscLogGpuTimeEnd());

149:   /* copy off-diagonal column indices to the CPU to build the local-to-global map */
150:   PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
151:   PetscCallHIP(hipMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), hipMemcpyDeviceToHost));
152:   auto o_j = d_j.begin();
153:   PetscCall(PetscLogGpuTimeBegin());
154:   thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
155:   thrust::sort(thrust::device, o_j, d_j.end());
156:   auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
157:   PetscCall(PetscLogGpuTimeEnd());
158:   noff = thrust::distance(o_j, wit);
159:   PetscCall(PetscMalloc1(noff, &b->garray));
160:   PetscCallHIP(hipMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), hipMemcpyDeviceToHost));
161:   PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
162:   PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
163:   PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
164:   PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
165:   PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
166:   PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
167:   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
168:   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
169:   PetscCall(MatSetType(b->A, MATSEQAIJHIPSPARSE));
170:   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
171:   PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
172:   PetscCall(MatSetType(b->B, MATSEQAIJHIPSPARSE));

174:   /* GPU memory; the hipSPARSE-specific preallocation call handles it internally */
175:   PetscCall(MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
176:   PetscCall(MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
177:   PetscCall(PetscFree(jj));

179:   PetscCall(MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, cusp->diagGPUMatFormat));
180:   PetscCall(MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, cusp->offdiagGPUMatFormat));
181:   PetscCall(MatBindToCPU(b->A, B->boundtocpu));
182:   PetscCall(MatBindToCPU(b->B, B->boundtocpu));
183:   PetscCall(MatSetUpMultiply_MPIAIJ(B));
184:   PetscFunctionReturn(PETSC_SUCCESS);
185: }
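
/* Worked example of the split performed above: with two ranks and a 4x4
   matrix where this rank owns columns [0,2) (cstart = 0, cend = 2), the COO
   entry (0,1) lands in the diagonal block b->A while (0,3) lands in the
   off-diagonal block b->B. After partitioning, diag columns are shifted by
   cstart and offdiag columns are compacted through b->garray, so (0,3)
   becomes local entry (0,0) of b->B with b->garray[0] = 3. */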

187: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
188: {
189:   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
190:   Mat_MPIAIJHIPSPARSE *mpidev;
191:   PetscBool            coo_basic = PETSC_TRUE;
192:   PetscMemType         mtype     = PETSC_MEMTYPE_DEVICE;
193:   PetscInt             rstart, rend;

195:   PetscFunctionBegin;
196:   PetscCall(PetscFree(mpiaij->garray));
197:   PetscCall(VecDestroy(&mpiaij->lvec));
198: #if defined(PETSC_USE_CTABLE)
199:   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
200: #else
201:   PetscCall(PetscFree(mpiaij->colmap));
202: #endif
203:   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
204:   mat->assembled     = PETSC_FALSE;
205:   mat->was_assembled = PETSC_FALSE;
206:   PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
207:   PetscCall(MatResetPreallocationCOO_MPIAIJHIPSPARSE(mat));
208:   if (coo_i) {
209:     PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
210:     PetscCall(PetscGetMemType(coo_i, &mtype));
211:     if (PetscMemTypeHost(mtype)) {
212:       for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
213:         if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
214:           coo_basic = PETSC_FALSE;
215:           break;
216:         }
217:       }
218:     }
219:   }
220:   /* All ranks must agree on the value of coo_basic */
221:   PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
222:   if (coo_basic) {
223:     PetscCall(MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j));
224:   } else {
225:     PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
226:     mat->offloadmask = PETSC_OFFLOAD_CPU;
227:     /* creates the GPU memory */
228:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A));
229:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B));
230:     mpidev                   = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
231:     mpidev->use_extended_coo = PETSC_TRUE;

233:     PetscCallHIP(hipMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
234:     PetscCallHIP(hipMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

236:     PetscCallHIP(hipMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
237:     PetscCallHIP(hipMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

239:     PetscCallHIP(hipMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
240:     PetscCallHIP(hipMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
241:     PetscCallHIP(hipMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

243:     PetscCallHIP(hipMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
244:     PetscCallHIP(hipMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
245:     PetscCallHIP(hipMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

247:     PetscCallHIP(hipMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
248:     PetscCallHIP(hipMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
249:     PetscCallHIP(hipMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

251:     PetscCallHIP(hipMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
252:     PetscCallHIP(hipMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice));

254:     PetscCallHIP(hipMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
255:     PetscCallHIP(hipMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice));

257:     PetscCallHIP(hipMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
258:     PetscCallHIP(hipMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
259:     PetscCallHIP(hipMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice));

261:     PetscCallHIP(hipMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
262:     PetscCallHIP(hipMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
263:     PetscCallHIP(hipMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice));

265:     PetscCallHIP(hipMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice));
266:   }
267:   PetscFunctionReturn(PETSC_SUCCESS);
268: }
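
/* A minimal user-level sketch of the COO assembly path that dispatches into
   the routine above (M, N, n, coo_i, coo_j, coo_v are placeholders):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N));
     PetscCall(MatSetType(A, MATAIJHIPSPARSE));
     PetscCall(MatSetPreallocationCOO(A, n, coo_i, coo_j)); // analysis done once
     PetscCall(MatSetValuesCOO(A, coo_v, ADD_VALUES));      // values only
     PetscCall(MatDestroy(&A));

   MatSetValuesCOO() may be called repeatedly with new values (e.g. once per
   nonlinear iteration) without repeating the preallocation analysis. */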

270: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
271: {
272:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
273:   const PetscCount grid_size = gridDim.x * blockDim.x;
274:   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
275: }
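
/* The kernels in this file use grid-stride loops: thread i processes
   elements i, i + grid_size, i + 2*grid_size, ..., so any launch geometry
   covers any nnz. For example, with nnz = 1000 and the dim3((nnz + 255) / 256),
   dim3(256) launches used below, grid_size = 4 * 256 = 1024 and each thread
   runs the body at most once; with a smaller grid each thread simply loops. */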

277: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
278: {
279:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
280:   const PetscCount grid_size = gridDim.x * blockDim.x;
281:   for (; i < Annz + Bnnz; i += grid_size) {
282:     PetscScalar sum = 0.0;
283:     if (i < Annz) {
284:       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
285:       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
286:     } else {
287:       i -= Annz;
288:       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
289:       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
290:     }
291:   }
292: }
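
/* Ajmap1/Bjmap1 are CSR-like maps over the nonzeros: the COO positions
   Aperm1[Ajmap1[i]..Ajmap1[i+1]) all contribute to nonzero i of A, so
   duplicate COO entries are summed on the device. For example, if the user
   listed the same (row, col) at COO positions 3 and 7, then for the matching
   nonzero i, Ajmap1[i+1] - Ajmap1[i] = 2 and Aperm1 holds {..., 3, 7, ...}. */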

294: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
295: {
296:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
297:   const PetscCount grid_size = gridDim.x * blockDim.x;
298:   for (; i < Annz2 + Bnnz2; i += grid_size) {
299:     if (i < Annz2) {
300:       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
301:     } else {
302:       i -= Annz2;
303:       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
304:     }
305:   }
306: }

308: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
309: {
310:   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
311:   Mat_MPIAIJHIPSPARSE *mpidev = static_cast<Mat_MPIAIJHIPSPARSE *>(mpiaij->spptr);
312:   Mat                  A = mpiaij->A, B = mpiaij->B;
313:   PetscCount           Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
314:   PetscScalar         *Aa, *Ba = NULL;
315:   PetscScalar         *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
316:   const PetscScalar   *v1     = v;
317:   const PetscCount    *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
318:   const PetscCount    *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
319:   const PetscCount    *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
320:   const PetscCount    *Cperm1 = mpidev->Cperm1_d;
321:   PetscMemType         memtype;

323:   PetscFunctionBegin;
324:   if (mpidev->use_extended_coo) {
325:     PetscMPIInt size;

327:     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
328:     PetscCall(PetscGetMemType(v, &memtype));
329:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
330:       PetscCallHIP(hipMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
331:       PetscCallHIP(hipMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice));
332:     }

334:     if (imode == INSERT_VALUES) {
335:       PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
336:       PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba));
337:     } else {
338:       PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa)); /* read & write matrix values */
339:       PetscCall(MatSeqAIJHIPSPARSEGetArray(B, &Ba));
340:     }

342:     /* Pack entries to be sent to remote processes */
343:     if (mpiaij->sendlen) {
344:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((mpiaij->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, mpiaij->sendlen, Cperm1, vsend);
345:       PetscCallHIP(hipPeekAtLastError());
346:     }

348:     /* Send remote entries to their owner and overlap the communication with local computation */
349:     PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE));
350:     /* Add local entries to A and B */
351:     if (Annz + Bnnz > 0) {
352:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
353:       PetscCallHIP(hipPeekAtLastError());
354:     }
355:     PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

357:     /* Add received remote entries to A and B */
358:     if (Annz2 + Bnnz2 > 0) {
359:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
360:       PetscCallHIP(hipPeekAtLastError());
361:     }

363:     if (imode == INSERT_VALUES) {
364:       PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
365:       PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba));
366:     } else {
367:       PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
368:       PetscCall(MatSeqAIJHIPSPARSERestoreArray(B, &Ba));
369:     }
370:     if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
371:   } else {
372:     PetscCall(MatSetValuesCOO_MPIAIJHIPSPARSE_Basic(mat, v, imode));
373:   }
374:   mat->offloadmask = PETSC_OFFLOAD_GPU;
375:   PetscFunctionReturn(PETSC_SUCCESS);
376: }
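
/* The split-phase PetscSF calls above follow the standard overlap pattern:
   start the reduction of remote entries, run independent local work while
   messages are in flight, then complete the reduction and consume what
   arrived. A generic sketch of the same structure:

     PetscCall(PetscSFReduceBegin(sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
     // ... independent local work (here: the MatAddLocalCOOValues kernel) ...
     PetscCall(PetscSFReduceEnd(sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
     // ... consume recvbuf (here: the MatAddRemoteCOOValues kernel) ...
*/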

378: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
379: {
380:   Mat             Ad, Ao;
381:   const PetscInt *cmap;

383:   PetscFunctionBegin;
384:   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
385:   PetscCall(MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc));
386:   if (glob) {
387:     PetscInt cst, i, dn, on, *gidx;

389:     PetscCall(MatGetLocalSize(Ad, NULL, &dn));
390:     PetscCall(MatGetLocalSize(Ao, NULL, &on));
391:     PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
392:     PetscCall(PetscMalloc1(dn + on, &gidx));
393:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
394:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
395:     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
396:   }
397:   PetscFunctionReturn(PETSC_SUCCESS);
398: }

400: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
401: {
402:   Mat_MPIAIJ          *b               = (Mat_MPIAIJ *)B->data;
403:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
404:   PetscInt             i;

406:   PetscFunctionBegin;
407:   PetscCall(PetscLayoutSetUp(B->rmap));
408:   PetscCall(PetscLayoutSetUp(B->cmap));
409:   if (PetscDefined(USE_DEBUG) && d_nnz) {
410:     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
411:   }
412:   if (PetscDefined(USE_DEBUG) && o_nnz) {
413:     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
414:   }
415: #if defined(PETSC_USE_CTABLE)
416:   PetscCall(PetscHMapIDestroy(&b->colmap));
417: #else
418:   PetscCall(PetscFree(b->colmap));
419: #endif
420:   PetscCall(PetscFree(b->garray));
421:   PetscCall(VecDestroy(&b->lvec));
422:   PetscCall(VecScatterDestroy(&b->Mvctx));
423:   /* Because B may have been resized we simply destroy it and create a new one each time */
424:   PetscCall(MatDestroy(&b->B));
425:   if (!b->A) {
426:     PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
427:     PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
428:   }
429:   if (!b->B) {
430:     PetscMPIInt size;
431:     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
432:     PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
433:     PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
434:   }
435:   PetscCall(MatSetType(b->A, MATSEQAIJHIPSPARSE));
436:   PetscCall(MatSetType(b->B, MATSEQAIJHIPSPARSE));
437:   PetscCall(MatBindToCPU(b->A, B->boundtocpu));
438:   PetscCall(MatBindToCPU(b->B, B->boundtocpu));
439:   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
440:   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
441:   PetscCall(MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat));
442:   PetscCall(MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat));
443:   B->preallocated = PETSC_TRUE;
444:   PetscFunctionReturn(PETSC_SUCCESS);
445: }

447: PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
448: {
449:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

451:   PetscFunctionBegin;
452:   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
453:   PetscCall((*a->A->ops->mult)(a->A, xx, yy));
454:   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
455:   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
456:   PetscFunctionReturn(PETSC_SUCCESS);
457: }
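
/* MatMult above computes yy = (a->A) xx + (a->B) lvec, overlapping the
   ghost-value scatter with the diagonal-block multiply: VecScatterBegin()
   posts the communication, the local SpMV runs while it is in flight, and
   VecScatterEnd() completes it before the off-diagonal contribution is
   added. */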

459: PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
460: {
461:   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

463:   PetscFunctionBegin;
464:   PetscCall(MatZeroEntries(l->A));
465:   PetscCall(MatZeroEntries(l->B));
466:   PetscFunctionReturn(PETSC_SUCCESS);
467: }

469: PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
470: {
471:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

473:   PetscFunctionBegin;
474:   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
475:   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
476:   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
477:   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
478:   PetscFunctionReturn(PETSC_SUCCESS);
479: }

481: PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
482: {
483:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

485:   PetscFunctionBegin;
486:   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
487:   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
488:   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
489:   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
490:   PetscFunctionReturn(PETSC_SUCCESS);
491: }

493: PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
494: {
495:   Mat_MPIAIJ          *a               = (Mat_MPIAIJ *)A->data;
496:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

498:   PetscFunctionBegin;
499:   switch (op) {
500:   case MAT_HIPSPARSE_MULT_DIAG:
501:     hipsparseStruct->diagGPUMatFormat = format;
502:     break;
503:   case MAT_HIPSPARSE_MULT_OFFDIAG:
504:     hipsparseStruct->offdiagGPUMatFormat = format;
505:     break;
506:   case MAT_HIPSPARSE_ALL:
507:     hipsparseStruct->diagGPUMatFormat    = format;
508:     hipsparseStruct->offdiagGPUMatFormat = format;
509:     break;
510:   default:
511:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_OFFDIAG, and MAT_HIPSPARSE_ALL are currently supported.", op);
512:   }
513:   PetscFunctionReturn(PETSC_SUCCESS);
514: }

516: PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
517: {
518:   MatHIPSPARSEStorageFormat format;
519:   PetscBool                 flg;
520:   Mat_MPIAIJ               *a               = (Mat_MPIAIJ *)A->data;
521:   Mat_MPIAIJHIPSPARSE      *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

523:   PetscFunctionBegin;
524:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
525:   if (A->factortype == MAT_FACTOR_NONE) {
526:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
527:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format));
528:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
529:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format));
530:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
531:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
532:   }
533:   PetscOptionsHeadEnd();
534:   PetscFunctionReturn(PETSC_SUCCESS);
535: }

537: PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
538: {
539:   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)A->data;
540:   Mat_MPIAIJHIPSPARSE *cusp   = (Mat_MPIAIJHIPSPARSE *)mpiaij->spptr;
541:   PetscObjectState     onnz   = A->nonzerostate;

543:   PetscFunctionBegin;
544:   PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
545:   if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQHIP));
546:   if (onnz != A->nonzerostate && cusp->deviceMat) {
547:     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

549:     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
550:     PetscCall(PetscNew(&h_mat));
551:     PetscCallHIP(hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost));
552:     PetscCallHIP(hipFree(h_mat->colmap));
553:     if (h_mat->allocated_indices) {
554:       PetscCallHIP(hipFree(h_mat->diag.i));
555:       PetscCallHIP(hipFree(h_mat->diag.j));
556:       if (h_mat->offdiag.j) {
557:         PetscCallHIP(hipFree(h_mat->offdiag.i));
558:         PetscCallHIP(hipFree(h_mat->offdiag.j));
559:       }
560:     }
561:     PetscCallHIP(hipFree(d_mat));
562:     PetscCall(PetscFree(h_mat));
563:     cusp->deviceMat = NULL;
564:   }
565:   PetscFunctionReturn(PETSC_SUCCESS);
566: }

568: PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
569: {
570:   Mat_MPIAIJ          *aij             = (Mat_MPIAIJ *)A->data;
571:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

573:   PetscFunctionBegin;
574:   PetscCheck(hipsparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
575:   if (hipsparseStruct->deviceMat) {
576:     PetscSplitCSRDataStructure d_mat = hipsparseStruct->deviceMat, h_mat;

578:     PetscCall(PetscInfo(A, "Have device matrix\n"));
579:     PetscCall(PetscNew(&h_mat));
580:     PetscCallHIP(hipMemcpy(h_mat, d_mat, sizeof(*d_mat), hipMemcpyDeviceToHost));
581:     PetscCallHIP(hipFree(h_mat->colmap));
582:     if (h_mat->allocated_indices) {
583:       PetscCallHIP(hipFree(h_mat->diag.i));
584:       PetscCallHIP(hipFree(h_mat->diag.j));
585:       if (h_mat->offdiag.j) {
586:         PetscCallHIP(hipFree(h_mat->offdiag.i));
587:         PetscCallHIP(hipFree(h_mat->offdiag.j));
588:       }
589:     }
590:     PetscCallHIP(hipFree(d_mat));
591:     PetscCall(PetscFree(h_mat));
592:   }
593:   /* Free COO */
594:   PetscCall(MatResetPreallocationCOO_MPIAIJHIPSPARSE(A));
595:   PetscCallCXX(delete hipsparseStruct);
596:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
597:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
598:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
599:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
600:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
601:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL));
602:   PetscCall(MatDestroy_MPIAIJ(A));
603:   PetscFunctionReturn(PETSC_SUCCESS);
604: }

606: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
607: {
608:   Mat_MPIAIJ *a;
609:   Mat         A;

611:   PetscFunctionBegin;
612:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
613:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
614:   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
615:   A             = *newmat;
616:   A->boundtocpu = PETSC_FALSE;
617:   PetscCall(PetscFree(A->defaultvectype));
618:   PetscCall(PetscStrallocpy(VECHIP, &A->defaultvectype));

620:   a = (Mat_MPIAIJ *)A->data;
621:   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJHIPSPARSE));
622:   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJHIPSPARSE));
623:   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQHIP));

625:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJHIPSPARSE);

627:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJHIPSPARSE;
628:   A->ops->mult                  = MatMult_MPIAIJHIPSPARSE;
629:   A->ops->multadd               = MatMultAdd_MPIAIJHIPSPARSE;
630:   A->ops->multtranspose         = MatMultTranspose_MPIAIJHIPSPARSE;
631:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJHIPSPARSE;
632:   A->ops->destroy               = MatDestroy_MPIAIJHIPSPARSE;
633:   A->ops->zeroentries           = MatZeroEntries_MPIAIJHIPSPARSE;
634:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

636:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE));
637:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE));
638:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE));
639:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE));
640:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE));
641:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE));
642: #if defined(PETSC_HAVE_HYPRE)
643:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
644: #endif
645:   PetscFunctionReturn(PETSC_SUCCESS);
646: }

648: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
649: {
650:   PetscFunctionBegin;
651:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
652:   PetscCall(MatCreate_MPIAIJ(A));
653:   PetscCall(MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A));
654:   PetscFunctionReturn(PETSC_SUCCESS);
655: }

657: /*@
658:    MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
659:    (the default parallel PETSc format). This matrix will ultimately be pushed down
660:    to AMD GPUs and use the HIPSPARSE library for calculations. For good matrix
661:    assembly performance the user should preallocate the matrix storage by setting
662:    the parameters `d_nz` and `o_nz` (or the arrays `d_nnz` and `o_nnz`).

664:    Collective

666:    Input Parameters:
667: +  comm - MPI communicator
668: .  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
669:            This value should be the same as the local size used in creating the
670:            y vector for the matrix-vector product y = Ax.
671: .  n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
672:        This value should be the same as the local size used in creating the
673:        x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
674: .  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
675: .  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
676: .  d_nz - number of nonzeros per row (same for all rows), for the "diagonal" portion of the matrix
677: .  d_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "diagonal" portion of the matrix
678: .  o_nz - number of nonzeros per row (same for all rows), for the "off-diagonal" portion of the matrix
679: -  o_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "off-diagonal" portion of the matrix

681:    Output Parameter:
682: .  A - the matrix

684:    Level: intermediate

686:    Notes:
687:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
688:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
689:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

691:    If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored

693:    The `MATAIJ` format (compressed row storage), is fully compatible with standard Fortran
694:    storage.  That is, the stored row and column indices can begin at
695:    either one (as in Fortran) or zero.

697:    Specify the preallocated storage with either `d_nz` (`o_nz`) or `d_nnz` (`o_nnz`) (not both).
698:    Set `d_nz` (`o_nz`) = `PETSC_DEFAULT` and `d_nnz` (`o_nnz`) = `NULL` for PETSc to control dynamic memory
699:    allocation.

701: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
702: @*/
703: PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
704: {
705:   PetscMPIInt size;

707:   PetscFunctionBegin;
708:   PetscCall(MatCreate(comm, A));
709:   PetscCall(MatSetSizes(*A, m, n, M, N));
710:   PetscCallMPI(MPI_Comm_size(comm, &size));
711:   if (size > 1) {
712:     PetscCall(MatSetType(*A, MATMPIAIJHIPSPARSE));
713:     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
714:   } else {
715:     PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
716:     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
717:   }
718:   PetscFunctionReturn(PETSC_SUCCESS);
719: }
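
/* A minimal usage sketch for the routine above (the global size 100 and the
   preallocation counts are placeholders):

     Mat A;
     PetscCall(MatCreateAIJHIPSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
     PetscCall(MatSetValue(A, 0, 0, 1.0, INSERT_VALUES)); // insert on the host, in locally owned rows
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     // subsequent MatMult() etc. run on the GPU through HIPSPARSE
     PetscCall(MatDestroy(&A));
*/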

721: /*MC
722:    MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJHIPSPARSE`.

724:    A matrix type whose data resides on GPUs. These matrices can be stored in
725:    CSR, ELL, or Hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.

727:    This matrix type is identical to `MATSEQAIJHIPSPARSE` when constructed with a single process communicator,
728:    and `MATMPIAIJHIPSPARSE` otherwise.  As a result, for single process communicators,
729:    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
730:    for communicators controlling multiple processes.  It is recommended that you call both of
731:    the above preallocation routines for simplicity.

733:    Options Database Keys:
734: +  -mat_type mpiaijhipsparse - sets the matrix type to `MATMPIAIJHIPSPARSE`
735: .  -mat_hipsparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
736: .  -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
737: -  -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

739:   Level: beginner

741: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
742: M*/
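
/* For example, to run an existing code with this type and ELL storage for
   the SpMV (command-line sketch):

     ./app -mat_type mpiaijhipsparse -mat_hipsparse_storage_format ell

   or equivalently in code, assuming A already has an *AIJHIPSPARSE type and
   using the MAT_HIPSPARSE_ELL enumerator of MatHIPSPARSEStorageFormat:

     PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, MAT_HIPSPARSE_ELL));
*/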

744: /*MC
745:    MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJHIPSPARSE`.

747:   Level: beginner

749: .seealso: [](ch_matrices), `Mat`, `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
750: M*/

752: // Get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat.
753: PetscErrorCode MatHIPSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
754: {
755:   PetscSplitCSRDataStructure d_mat;
756:   PetscMPIInt                size;
757:   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
758:   PetscScalar               *aa = NULL, *ba = NULL;
759:   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
760:   Mat_SeqAIJHIPSPARSE       *hipsparsestructA = NULL;
761:   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

763:   PetscFunctionBegin;
764:   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
765:   if (A->factortype != MAT_FACTOR_NONE) {
766:     *B = NULL;
767:     PetscFunctionReturn(PETSC_SUCCESS);
768:   }
769:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
770:   // get jaca
771:   if (size == 1) {
772:     PetscBool isseqaij;

774:     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
775:     if (isseqaij) {
776:       jaca = (Mat_SeqAIJ *)A->data;
777:       PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
778:       hipsparsestructA = (Mat_SeqAIJHIPSPARSE *)A->spptr;
779:       d_mat            = hipsparsestructA->deviceMat;
780:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
781:     } else {
782:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
783:       PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
784:       Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
785:       jaca                       = (Mat_SeqAIJ *)aij->A->data;
786:       hipsparsestructA           = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
787:       d_mat                      = spptr->deviceMat;
788:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(aij->A));
789:     }
790:     if (hipsparsestructA->format == MAT_HIPSPARSE_CSR) {
791:       Mat_SeqAIJHIPSPARSEMultStruct *matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
792:       PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJHIPSPARSEMultStruct for A");
793:       matrixA = (CsrMatrix *)matstruct->mat;
794:       bi      = NULL;
795:       bj      = NULL;
796:       ba      = NULL;
797:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_HIPSPARSE_CSR");
798:   } else {
799:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
800:     PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
801:     jaca                       = (Mat_SeqAIJ *)aij->A->data;
802:     jacb                       = (Mat_SeqAIJ *)aij->B->data;
803:     Mat_MPIAIJHIPSPARSE *spptr = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

805:     PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE) or MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE)");
806:     hipsparsestructA                      = (Mat_SeqAIJHIPSPARSE *)aij->A->spptr;
807:     Mat_SeqAIJHIPSPARSE *hipsparsestructB = (Mat_SeqAIJHIPSPARSE *)aij->B->spptr;
808:     PetscCheck(hipsparsestructA->format == MAT_HIPSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_HIPSPARSE_CSR");
809:     if (hipsparsestructB->format == MAT_HIPSPARSE_CSR) {
810:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(aij->A));
811:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(aij->B));
812:       Mat_SeqAIJHIPSPARSEMultStruct *matstructA = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructA->mat;
813:       Mat_SeqAIJHIPSPARSEMultStruct *matstructB = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestructB->mat;
814:       PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJHIPSPARSEMultStruct for A");
815:       PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJHIPSPARSEMultStruct for B");
816:       matrixA = (CsrMatrix *)matstructA->mat;
817:       matrixB = (CsrMatrix *)matstructB->mat;
818:       if (jacb->compressedrow.use) {
819:         if (!hipsparsestructB->rowoffsets_gpu) {
820:           hipsparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
821:           hipsparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
822:         }
823:         bi = thrust::raw_pointer_cast(hipsparsestructB->rowoffsets_gpu->data());
824:       } else {
825:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
826:       }
827:       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
828:       ba = thrust::raw_pointer_cast(matrixB->values->data());
829:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_HIPSPARSE_CSR");
830:     d_mat = spptr->deviceMat;
831:   }
832:   if (jaca->compressedrow.use) {
833:     if (!hipsparsestructA->rowoffsets_gpu) {
834:       hipsparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
835:       hipsparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
836:     }
837:     ai = thrust::raw_pointer_cast(hipsparsestructA->rowoffsets_gpu->data());
838:   } else {
839:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
840:   }
841:   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
842:   aa = thrust::raw_pointer_cast(matrixA->values->data());

844:   if (!d_mat) {
845:     PetscSplitCSRDataStructure h_mat;

847:     // create and populate the struct on the host, then copy it to the device
848:     PetscCall(PetscInfo(A, "Create device matrix\n"));
849:     PetscCall(PetscNew(&h_mat));
850:     PetscCallHIP(hipMalloc((void **)&d_mat, sizeof(*d_mat)));
851:     if (size > 1) { /* need the colmap array */
852:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
853:       PetscInt   *colmap;
854:       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

856:       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

858:       PetscCall(PetscCalloc1(N + 1, &colmap));
859:       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = ii + 1;
860: #if defined(PETSC_USE_64BIT_INDICES)
861:       { // have to make 64-bit (PetscInt) versions of these 32-bit device index arrays
862:         int      *h_bi32, *h_bj32;
863:         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
864:         PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
865:         PetscCallHIP(hipMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), hipMemcpyDeviceToHost));
866:         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
867:         PetscCallHIP(hipMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), hipMemcpyDeviceToHost));
868:         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

870:         PetscCallHIP(hipMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
871:         PetscCallHIP(hipMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), hipMemcpyHostToDevice));
872:         PetscCallHIP(hipMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
873:         PetscCallHIP(hipMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), hipMemcpyHostToDevice));

875:         h_mat->offdiag.i         = d_bi64;
876:         h_mat->offdiag.j         = d_bj64;
877:         h_mat->allocated_indices = PETSC_TRUE;

879:         PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
880:       }
881: #else
882:       h_mat->offdiag.i         = (PetscInt *)bi;
883:       h_mat->offdiag.j         = (PetscInt *)bj;
884:       h_mat->allocated_indices = PETSC_FALSE;
885: #endif
886:       h_mat->offdiag.a = ba;
887:       h_mat->offdiag.n = A->rmap->n;

889:       PetscCallHIP(hipMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
890:       PetscCallHIP(hipMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), hipMemcpyHostToDevice));
891:       PetscCall(PetscFree(colmap));
892:     }
893:     h_mat->rstart = A->rmap->rstart;
894:     h_mat->rend   = A->rmap->rend;
895:     h_mat->cstart = A->cmap->rstart;
896:     h_mat->cend   = A->cmap->rend;
897:     h_mat->M      = A->cmap->N;
898: #if defined(PETSC_USE_64BIT_INDICES)
899:     {
900:       int      *h_ai32, *h_aj32;
901:       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
902:       PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
903:       PetscCallHIP(hipMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), hipMemcpyDeviceToHost));
904:       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
905:       PetscCallHIP(hipMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), hipMemcpyDeviceToHost));
906:       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

908:       PetscCallHIP(hipMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
909:       PetscCallHIP(hipMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), hipMemcpyHostToDevice));
910:       PetscCallHIP(hipMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
911:       PetscCallHIP(hipMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), hipMemcpyHostToDevice));

913:       h_mat->diag.i            = d_ai64;
914:       h_mat->diag.j            = d_aj64;
915:       h_mat->allocated_indices = PETSC_TRUE;

917:       PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
918:     }
919: #else
920:     h_mat->diag.i            = (PetscInt *)ai;
921:     h_mat->diag.j            = (PetscInt *)aj;
922:     h_mat->allocated_indices = PETSC_FALSE;
923: #endif
924:     h_mat->diag.a = aa;
925:     h_mat->diag.n = A->rmap->n;
926:     h_mat->rank   = PetscGlobalRank;
927:     // copy pointers and metadata to device
928:     PetscCallHIP(hipMemcpy(d_mat, h_mat, sizeof(*d_mat), hipMemcpyHostToDevice));
929:     PetscCall(PetscFree(h_mat));
930:   } else {
931:     PetscCall(PetscInfo(A, "Reusing device matrix\n"));
932:   }
933:   *B             = d_mat;
934:   A->offloadmask = PETSC_OFFLOAD_GPU;
935:   PetscFunctionReturn(PETSC_SUCCESS);
936: }
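
/* A hypothetical device-side sketch of consuming the returned struct, using
   only fields populated above (rstart/cstart/cend, the diag/offdiag CSR
   arrays, and the 1-based colmap): to add val at a locally owned global
   entry (r, c),

     PetscInt row = r - d_mat->rstart; // local row
     if (c >= d_mat->cstart && c < d_mat->cend) { // diagonal block
       PetscInt col = c - d_mat->cstart;
       for (PetscInt k = d_mat->diag.i[row]; k < d_mat->diag.i[row + 1]; k++)
         if (d_mat->diag.j[k] == col) d_mat->diag.a[k] += val;
     } else if (d_mat->colmap[c]) { // off-diagonal block, compact column id
       PetscInt col = d_mat->colmap[c] - 1;
       for (PetscInt k = d_mat->offdiag.i[row]; k < d_mat->offdiag.i[row + 1]; k++)
         if (d_mat->offdiag.j[k] == col) d_mat->offdiag.a[k] += val;
     }
*/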