Actual source code: mpiaijcusparse.cu

#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};
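
/* Summary added for readability (it describes the routine below): partition the COO
   entries so that those whose global column lies in [cstart, cend), i.e. the diagonal
   block, come first; convert row and column indices from global to local; compress the
   off-diagonal column indices into b->garray; then hand each half of the entry list to
   the sequential COO preallocation of b->A and b->B. */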

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal entries first and off-diagonal entries next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy the off-diagonal column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the off-diagonal part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique the off-diagonal col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* returns the end iterator of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray; the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " COO size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
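
/* Grid-stride kernel: gather the COO values destined for other MPI ranks into a
   contiguous send buffer, buf[i] = kv[perm[i]], where perm[] is the Cperm1 permutation
   built during preallocation. The grid-stride loop lets any launch size cover all nnz
   entries. */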

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}
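
/* Grid-stride kernel: accumulate the locally owned COO values into the diagonal (Aa)
   and off-diagonal (Ba) blocks. For nonzero i, Ajmap1[i]..Ajmap1[i+1] (resp. Bjmap1)
   delimits the repeated COO entries folding into it, and Aperm1/Bperm1 map those
   entries back into kv[]. With INSERT_VALUES the previous value is discarded; otherwise
   the sum is added to it. */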

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* local index, so the grid-stride counter i is not clobbered */
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}
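
/* Grid-stride kernel: add the values received from other ranks. Remote contributions
   generally touch only a subset of the local nonzeros, so Aimap2/Bimap2 select which
   entries of Aa/Ba are updated, while Ajmap2/Bjmap2 and Aperm2/Bperm2 play the same
   roles as in the local kernel. */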

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* local index, so the grid-stride counter i is not clobbered */
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
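
/* Extended COO path (summary added for readability): pack the entries owned by other
   ranks into sendbuf_d, start the PetscSF reduction, add the local entries to A and B
   while the communication is in flight, then fold in the received remote entries. */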

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If the user passed v[] on the host, copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B may have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
       calculated if `N` is given) For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
   A minimal sketch of that paradigm follows this routine.

   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
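
/*
  A minimal sketch of the recommended MatCreate()/MatSetType()/preallocation paradigm
  mentioned in the Notes above. Illustrative only: comm, m, n, M, N and the per-row
  nonzero counts (5 diagonal, 2 off-diagonal) are placeholders, not values taken from
  this file.

    Mat A;
    PetscCall(MatCreate(comm, &A));
    PetscCall(MatSetSizes(A, m, n, M, N));
    PetscCall(MatSetType(A, MATAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));
    PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));
    PetscCall(MatSetFromOptions(A));

  Calling both preallocation routines is harmless; each takes effect only for the
  matching (sequential or parallel) type, which is why the manual page below recommends
  calling both.
*/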

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or HYB (hybrid) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

   A short example of selecting the storage format follows this manual page.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
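
/*
  For illustration (a sketch, not part of this file's compiled code): the storage format
  can be chosen at run time through the options above, e.g.

    ./app -mat_type aijcusparse -mat_cusparse_storage_format ell

  ("./app" is a placeholder executable name), or programmatically through the API
  composed in this file, e.g.

    PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_ELL));

  where MAT_CUSPARSE_ELL is the ELLPACK value of MatCUSPARSEStorageFormat.
*/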

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}