Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3: #include <petscconf.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7: #include <thrust/advance.h>
8: #include <thrust/partition.h>
9: #include <thrust/sort.h>
10: #include <thrust/unique.h>
11: #include <petscsf.h>
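/* Thrust functor that copies the first tuple element into the second; used below to scatter
   permuted COO values into the coo_pw work array */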
13: struct VecCUDAEquals {
14: template <typename Tuple>
15: __host__ __device__ void operator()(Tuple t)
16: {
17: thrust::get<1>(t) = thrust::get<0>(t);
18: }
19: };
21: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
22: {
23: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
24: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
26: PetscFunctionBegin;
27: if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
28: if (cusparseStruct->use_extended_coo) {
29: PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
30: PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
31: PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
32: PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
33: PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
34: PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
35: PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
36: PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
37: PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
38: PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
39: PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
40: PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
41: PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
42: }
43: cusparseStruct->use_extended_coo = PETSC_FALSE;
44: delete cusparseStruct->coo_p;
45: delete cusparseStruct->coo_pw;
46: cusparseStruct->coo_p = NULL;
47: cusparseStruct->coo_pw = NULL;
48: PetscFunctionReturn(PETSC_SUCCESS);
49: }
51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
52: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
54: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
55: PetscInt n = cusp->coo_nd + cusp->coo_no;
57: PetscFunctionBegin;
58: if (cusp->coo_p && v) {
59: thrust::device_ptr<const PetscScalar> d_v;
60: THRUSTARRAY *w = NULL;
62: if (isCudaMem(v)) {
63: d_v = thrust::device_pointer_cast(v);
64: } else {
65: w = new THRUSTARRAY(n);
66: w->assign(v, v + n);
67: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
68: d_v = w->data();
69: }
71: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
72: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
73: PetscCall(PetscLogGpuTimeBegin());
74: thrust::for_each(zibit, zieit, VecCUDAEquals());
75: PetscCall(PetscLogGpuTimeEnd());
76: delete w;
77: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
78: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
79: } else {
80: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
81: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
82: }
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
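/* Helper functors for the basic COO preallocation below: IsNotOffDiagT and IsOffDiag test whether
   a column index falls inside the diagonal block [cstart, cend), and GlobToLoc shifts a global
   index into the local numbering */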
86: template <typename Tuple>
87: struct IsNotOffDiagT {
88: PetscInt _cstart, _cend;
90: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
91: __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
92: };
94: struct IsOffDiag {
95: PetscInt _cstart, _cend;
97: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
98: __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
99: };
101: struct GlobToLoc {
102: PetscInt _start;
104: GlobToLoc(PetscInt start) : _start(start) { }
105: __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
106: };
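/* Basic COO preallocation: partition the (i, j) pairs into diagonal-block and off-diagonal-block
   entries (recording the permutation in coo_p when reordering is needed), convert indices to local
   numbering, build the compacted off-diagonal column map (garray), and preallocate the sequential
   A and B blocks on the GPU */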
108: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
109: {
110: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
111: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
112: PetscInt N, *jj;
113: size_t noff = 0;
114: THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
115: THRUSTINTARRAY d_j(n);
116: ISLocalToGlobalMapping l2g;
118: PetscFunctionBegin;
119: PetscCall(MatDestroy(&b->A));
120: PetscCall(MatDestroy(&b->B));
122: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
123: d_i.assign(coo_i, coo_i + n);
124: d_j.assign(coo_j, coo_j + n);
125: PetscCall(PetscLogGpuTimeBegin());
126: auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
127: auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
128: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
129: cusp->coo_p = new THRUSTINTARRAY(n);
130: cusp->coo_pw = new THRUSTARRAY(n);
131: thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
132: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
133: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
134: auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
135: firstoffd = mzipp.get_iterator_tuple().get<1>();
136: }
137: cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
138: cusp->coo_no = thrust::distance(firstoffd, d_j.end());
140: /* from global to local */
141: thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
142: thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
143: PetscCall(PetscLogGpuTimeEnd());
145: /* copy offdiag column indices to map on the CPU */
146: PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
147: PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
148: auto o_j = d_j.begin();
149: PetscCall(PetscLogGpuTimeBegin());
150: thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
151: thrust::sort(thrust::device, o_j, d_j.end());
152: auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
153: PetscCall(PetscLogGpuTimeEnd());
154: noff = thrust::distance(o_j, wit);
155: /* allocate the garray, the columns of B will not be mapped in the multiply setup */
156: PetscCall(PetscMalloc1(noff, &b->garray));
157: PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
158: PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
159: PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
160: PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
161: PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
162: PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
163: PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
165: PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
166: PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
167: PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
168: PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
169: PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
170: PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
172: /* GPU memory, cusparse specific call handles it internally */
173: PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
174: PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
175: PetscCall(PetscFree(jj));
177: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
178: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));
180: PetscCall(MatBindToCPU(b->A, B->boundtocpu));
181: PetscCall(MatBindToCPU(b->B, B->boundtocpu));
182: PetscCall(MatSetUpMultiply_MPIAIJ(B));
183: PetscFunctionReturn(PETSC_SUCCESS);
184: }
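/* COO preallocation entry point: if the row indices are all locally owned and no indices are
   negative (checked only when the indices live on the host), every rank takes the basic all-GPU
   path above; otherwise fall back to the general MPIAIJ COO setup and mirror its host-side maps
   (jmap/perm/imap arrays and send/recv buffers) on the device */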
186: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
187: {
188: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
189: Mat_MPIAIJCUSPARSE *mpidev;
190: PetscBool coo_basic = PETSC_TRUE;
191: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
192: PetscInt rstart, rend;
194: PetscFunctionBegin;
195: PetscCall(PetscFree(mpiaij->garray));
196: PetscCall(VecDestroy(&mpiaij->lvec));
197: #if defined(PETSC_USE_CTABLE)
198: PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
199: #else
200: PetscCall(PetscFree(mpiaij->colmap));
201: #endif
202: PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
203: mat->assembled = PETSC_FALSE;
204: mat->was_assembled = PETSC_FALSE;
205: PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
206: PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
207: if (coo_i) {
208: PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
209: PetscCall(PetscGetMemType(coo_i, &mtype));
210: if (PetscMemTypeHost(mtype)) {
211: for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
212: if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
213: coo_basic = PETSC_FALSE;
214: break;
215: }
216: }
217: }
218: }
219: /* All ranks must agree on the value of coo_basic */
220: PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
221: if (coo_basic) {
222: PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
223: } else {
224: PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
225: mat->offloadmask = PETSC_OFFLOAD_CPU;
226: /* creates the GPU memory */
227: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
228: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
229: mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
230: mpidev->use_extended_coo = PETSC_TRUE;
232: PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
233: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));
235: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
236: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));
238: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
239: PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
240: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));
242: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
243: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
244: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));
246: PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
247: PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
248: PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));
250: PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
251: PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
253: PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
254: PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
256: PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
257: PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
258: PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
260: PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
261: PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
262: PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
264: PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
265: }
266: PetscFunctionReturn(PETSC_SUCCESS);
267: }
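/* Gather kernel: buf[i] = kv[perm[i]] for the nnz entries that must be sent to other ranks, using
   a grid-stride loop */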
269: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
270: {
271: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
272: const PetscCount grid_size = gridDim.x * blockDim.x;
273: for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
274: }
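/* For each local nonzero of A (first Annz threads) or B (next Bnnz threads), sum the input COO
   values mapped to it via Ajmap1/Aperm1 (resp. Bjmap1/Bperm1) and insert or add the result into
   Aa/Ba depending on imode */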
276: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
277: {
278: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
279: const PetscCount grid_size = gridDim.x * blockDim.x;
280: for (; i < Annz + Bnnz; i += grid_size) {
281: PetscScalar sum = 0.0;
282: if (i < Annz) {
283: for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
284: Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
285: } else {
286: i -= Annz;
287: for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
288: Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
289: }
290: }
291: }
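/* Same idea for values received from remote ranks: Aimap2/Bimap2 give the target nonzero,
   Ajmap2/Bjmap2 and Aperm2/Bperm2 map into the receive buffer; remote contributions are always
   accumulated into the existing entries */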
293: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
294: {
295: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
296: const PetscCount grid_size = gridDim.x * blockDim.x;
297: for (; i < Annz2 + Bnnz2; i += grid_size) {
298: if (i < Annz2) {
299: for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
300: } else {
301: i -= Annz2;
302: for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
303: }
304: }
305: }
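/* COO value insertion: with the extended maps, pack entries destined for other ranks, start the
   PetscSF reduction, add local entries to A and B while the communication proceeds, then add the
   received remote entries; without them, fall back to the basic path above */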
307: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
308: {
309: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
310: Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
311: Mat A = mpiaij->A, B = mpiaij->B;
312: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
313: PetscScalar *Aa, *Ba = NULL;
314: PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
315: const PetscScalar *v1 = v;
316: const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
317: const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
318: const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
319: const PetscCount *Cperm1 = mpidev->Cperm1_d;
320: PetscMemType memtype;
322: PetscFunctionBegin;
323: if (mpidev->use_extended_coo) {
324: PetscMPIInt size;
326: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
327: PetscCall(PetscGetMemType(v, &memtype));
328: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
329: PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
330: PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
331: }
333: if (imode == INSERT_VALUES) {
334: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
335: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
336: } else {
337: PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
338: PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
339: }
341: /* Pack entries to be sent to remote */
342: if (mpiaij->sendlen) {
343: MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
344: PetscCallCUDA(cudaPeekAtLastError());
345: }
347: /* Send remote entries to their owner and overlap the communication with local computation */
348: PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
349: /* Add local entries to A and B */
350: if (Annz + Bnnz > 0) {
351: MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
352: PetscCallCUDA(cudaPeekAtLastError());
353: }
354: PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
356: /* Add received remote entries to A and B */
357: if (Annz2 + Bnnz2 > 0) {
358: MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
359: PetscCallCUDA(cudaPeekAtLastError());
360: }
362: if (imode == INSERT_VALUES) {
363: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
364: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
365: } else {
366: PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
367: PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
368: }
369: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
370: } else {
371: PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
372: }
373: mat->offloadmask = PETSC_OFFLOAD_GPU;
374: PetscFunctionReturn(PETSC_SUCCESS);
375: }
377: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
378: {
379: Mat Ad, Ao;
380: const PetscInt *cmap;
382: PetscFunctionBegin;
383: PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
384: PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
385: if (glob) {
386: PetscInt cst, i, dn, on, *gidx;
388: PetscCall(MatGetLocalSize(Ad, NULL, &dn));
389: PetscCall(MatGetLocalSize(Ao, NULL, &on));
390: PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
391: PetscCall(PetscMalloc1(dn + on, &gidx));
392: for (i = 0; i < dn; i++) gidx[i] = cst + i;
393: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
394: PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
395: }
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
400: {
401: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
402: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
403: PetscInt i;
405: PetscFunctionBegin;
406: PetscCall(PetscLayoutSetUp(B->rmap));
407: PetscCall(PetscLayoutSetUp(B->cmap));
408: if (PetscDefined(USE_DEBUG) && d_nnz) {
409: for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
410: }
411: if (PetscDefined(USE_DEBUG) && o_nnz) {
412: for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
413: }
414: #if defined(PETSC_USE_CTABLE)
415: PetscCall(PetscHMapIDestroy(&b->colmap));
416: #else
417: PetscCall(PetscFree(b->colmap));
418: #endif
419: PetscCall(PetscFree(b->garray));
420: PetscCall(VecDestroy(&b->lvec));
421: PetscCall(VecScatterDestroy(&b->Mvctx));
422: /* Because B will have been resized we simply destroy it and create a new one each time */
423: PetscCall(MatDestroy(&b->B));
424: if (!b->A) {
425: PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
426: PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
427: }
428: if (!b->B) {
429: PetscMPIInt size;
430: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
431: PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
432: PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
433: }
434: PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
435: PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
436: PetscCall(MatBindToCPU(b->A, B->boundtocpu));
437: PetscCall(MatBindToCPU(b->B, B->boundtocpu));
438: PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
439: PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
440: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
441: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
442: B->preallocated = PETSC_TRUE;
443: PetscFunctionReturn(PETSC_SUCCESS);
444: }
446: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
447: {
448: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
450: PetscFunctionBegin;
451: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
452: PetscCall((*a->A->ops->mult)(a->A, xx, yy));
453: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
454: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
455: PetscFunctionReturn(PETSC_SUCCESS);
456: }
458: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
459: {
460: Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
462: PetscFunctionBegin;
463: PetscCall(MatZeroEntries(l->A));
464: PetscCall(MatZeroEntries(l->B));
465: PetscFunctionReturn(PETSC_SUCCESS);
466: }
468: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
469: {
470: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
472: PetscFunctionBegin;
473: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
474: PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
475: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
476: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
477: PetscFunctionReturn(PETSC_SUCCESS);
478: }
480: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
481: {
482: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
484: PetscFunctionBegin;
485: PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
486: PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
487: PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
488: PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
489: PetscFunctionReturn(PETSC_SUCCESS);
490: }
492: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
493: {
494: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
495: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
497: PetscFunctionBegin;
498: switch (op) {
499: case MAT_CUSPARSE_MULT_DIAG:
500: cusparseStruct->diagGPUMatFormat = format;
501: break;
502: case MAT_CUSPARSE_MULT_OFFDIAG:
503: cusparseStruct->offdiagGPUMatFormat = format;
504: break;
505: case MAT_CUSPARSE_ALL:
506: cusparseStruct->diagGPUMatFormat = format;
507: cusparseStruct->offdiagGPUMatFormat = format;
508: break;
509: default:
510: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
511: }
512: PetscFunctionReturn(PETSC_SUCCESS);
513: }
515: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
516: {
517: MatCUSPARSEStorageFormat format;
518: PetscBool flg;
519: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
520: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
522: PetscFunctionBegin;
523: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
524: if (A->factortype == MAT_FACTOR_NONE) {
525: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
526: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
527: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
528: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
529: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
530: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
531: }
532: PetscOptionsHeadEnd();
533: PetscFunctionReturn(PETSC_SUCCESS);
534: }
536: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
537: {
538: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
539: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
540: PetscObjectState onnz = A->nonzerostate;
542: PetscFunctionBegin;
543: PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
544: if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
545: if (onnz != A->nonzerostate && cusp->deviceMat) {
546: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
548: PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
549: PetscCall(PetscNew(&h_mat));
550: PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
551: PetscCallCUDA(cudaFree(h_mat->colmap));
552: if (h_mat->allocated_indices) {
553: PetscCallCUDA(cudaFree(h_mat->diag.i));
554: PetscCallCUDA(cudaFree(h_mat->diag.j));
555: if (h_mat->offdiag.j) {
556: PetscCallCUDA(cudaFree(h_mat->offdiag.i));
557: PetscCallCUDA(cudaFree(h_mat->offdiag.j));
558: }
559: }
560: PetscCallCUDA(cudaFree(d_mat));
561: PetscCall(PetscFree(h_mat));
562: cusp->deviceMat = NULL;
563: }
564: PetscFunctionReturn(PETSC_SUCCESS);
565: }
567: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
568: {
569: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
570: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
572: PetscFunctionBegin;
573: PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
574: if (cusparseStruct->deviceMat) {
575: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
577: PetscCall(PetscInfo(A, "Have device matrix\n"));
578: PetscCall(PetscNew(&h_mat));
579: PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
580: PetscCallCUDA(cudaFree(h_mat->colmap));
581: if (h_mat->allocated_indices) {
582: PetscCallCUDA(cudaFree(h_mat->diag.i));
583: PetscCallCUDA(cudaFree(h_mat->diag.j));
584: if (h_mat->offdiag.j) {
585: PetscCallCUDA(cudaFree(h_mat->offdiag.i));
586: PetscCallCUDA(cudaFree(h_mat->offdiag.j));
587: }
588: }
589: PetscCallCUDA(cudaFree(d_mat));
590: PetscCall(PetscFree(h_mat));
591: }
592: /* Free COO */
593: PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
594: PetscCallCXX(delete cusparseStruct);
595: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
596: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
597: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
598: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
599: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
600: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
601: PetscCall(MatDestroy_MPIAIJ(A));
602: PetscFunctionReturn(PETSC_SUCCESS);
603: }
605: /* defines MatSetValues_MPICUSPARSE_Hash() */
606: #define TYPE AIJ
607: #define TYPE_AIJ
608: #define SUB_TYPE_CUSPARSE
609: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
610: #undef TYPE
611: #undef TYPE_AIJ
612: #undef SUB_TYPE_CUSPARSE
614: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
615: {
616: Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data;
617: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
619: PetscFunctionBegin;
620: PetscCall(MatSetUp_MPI_Hash(A));
621: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
622: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
623: A->preallocated = PETSC_TRUE;
624: PetscFunctionReturn(PETSC_SUCCESS);
625: }
627: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
628: {
629: Mat_MPIAIJ *a;
630: Mat A;
632: PetscFunctionBegin;
633: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
634: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
635: else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
636: A = *newmat;
637: A->boundtocpu = PETSC_FALSE;
638: PetscCall(PetscFree(A->defaultvectype));
639: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
641: a = (Mat_MPIAIJ *)A->data;
642: if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
643: if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
644: if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
646: if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
648: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
649: A->ops->mult = MatMult_MPIAIJCUSPARSE;
650: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
651: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
652: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
653: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
654: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
655: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
656: A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE;
658: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
659: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
660: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
661: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
662: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
663: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
664: #if defined(PETSC_HAVE_HYPRE)
665: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
666: #endif
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
671: {
672: PetscFunctionBegin;
673: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
674: PetscCall(MatCreate_MPIAIJ(A));
675: PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
676: PetscFunctionReturn(PETSC_SUCCESS);
677: }
679: /*@
680: MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
681: (the default parallel PETSc format). This matrix will ultimately be pushed down
682: to NVIDIA GPUs and will use the CuSPARSE library for calculations.
684: Collective
686: Input Parameters:
687: + comm - MPI communicator
688: . m - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
689: This value should be the same as the local size used in creating the
690: y vector for the matrix-vector product y = Ax.
691: . n - This value should be the same as the local size used in creating the
692: x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have it
693: calculated if `N` is given). For square matrices `n` is almost always `m`.
694: . M - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
695: . N - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
696: . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix
697: (same value is used for all local rows)
698: . d_nnz - array containing the number of nonzeros in the various rows of the
699: DIAGONAL portion of the local submatrix (possibly different for each row)
700: or `NULL`, if `d_nz` is used to specify the nonzero structure.
701: The size of this array is equal to the number of local rows, i.e., `m`.
702: For matrices you plan to factor you must leave room for the diagonal entry and
703: put in the entry even if it is zero.
704: . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
705: submatrix (same value is used for all local rows).
706: - o_nnz - array containing the number of nonzeros in the various rows of the
707: OFF-DIAGONAL portion of the local submatrix (possibly different for
708: each row) or `NULL`, if `o_nz` is used to specify the nonzero
709: structure. The size of this array is equal to the number
710: of local rows, i.e., `m`.
712: Output Parameter:
713: . A - the matrix
715: Level: intermediate
717: Notes:
718: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
719: MatXXXXSetPreallocation() paradigm instead of this routine directly.
720: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
722: The AIJ format (also called compressed row storage)
723: is fully compatible with standard Fortran
724: storage. That is, the stored row and column indices can begin at
725: either one (as in Fortran) or zero.
727: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
728: @*/
729: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
730: {
731: PetscMPIInt size;
733: PetscFunctionBegin;
734: PetscCall(MatCreate(comm, A));
735: PetscCall(MatSetSizes(*A, m, n, M, N));
736: PetscCallMPI(MPI_Comm_size(comm, &size));
737: if (size > 1) {
738: PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
739: PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
740: } else {
741: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
742: PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
743: }
744: PetscFunctionReturn(PETSC_SUCCESS);
745: }
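/*
   A minimal usage sketch for MatCreateAIJCUSPARSE() (illustrative only, not part of the library
   source; the global size of 100 and the per-row nonzero estimates of 5 diagonal / 2 off-diagonal
   entries are assumptions made for the example):

     Mat A;
     PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
     // ... insert values with MatSetValues(), then assemble ...
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatDestroy(&A));

   Equivalently, and as recommended in the Notes above, one can use MatCreate(), MatSetSizes(),
   MatSetType() with MATAIJCUSPARSE, and MatMPIAIJSetPreallocation()/MatSeqAIJSetPreallocation().
*/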
747: /*MC
748: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
750: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
751: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
752: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
754: This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
755: and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
756: `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
757: for communicators controlling multiple processes. It is recommended that you call both of
758: the above preallocation routines for simplicity.
760: Options Database Keys:
761: + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
762: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
763: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
764: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
766: Level: beginner
768: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
769: M*/
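/*
   Sketch of selecting the storage format, using the options database keys documented above (the
   executable name is a placeholder):

     ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format ell

   or programmatically, e.g. to keep both the diagonal and off-diagonal blocks in CSR:

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_CSR));
*/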
771: /*MC
772: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
774: Level: beginner
776: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
777: M*/
779: // get GPU pointers to a stripped-down Mat; works for both seq and MPI Mat
780: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
781: {
782: PetscSplitCSRDataStructure d_mat;
783: PetscMPIInt size;
784: int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
785: PetscScalar *aa = NULL, *ba = NULL;
786: Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
787: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
788: CsrMatrix *matrixA = NULL, *matrixB = NULL;
790: PetscFunctionBegin;
791: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
792: if (A->factortype != MAT_FACTOR_NONE) {
793: *B = NULL;
794: PetscFunctionReturn(PETSC_SUCCESS);
795: }
796: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
797: // get jaca
798: if (size == 1) {
799: PetscBool isseqaij;
801: PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
802: if (isseqaij) {
803: jaca = (Mat_SeqAIJ *)A->data;
804: PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
805: cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
806: d_mat = cusparsestructA->deviceMat;
807: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
808: } else {
809: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
810: PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
811: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
812: jaca = (Mat_SeqAIJ *)aij->A->data;
813: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
814: d_mat = spptr->deviceMat;
815: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
816: }
817: if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
818: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
819: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
820: matrixA = (CsrMatrix *)matstruct->mat;
821: bi = NULL;
822: bj = NULL;
823: ba = NULL;
824: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
825: } else {
826: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
827: PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
828: jaca = (Mat_SeqAIJ *)aij->A->data;
829: jacb = (Mat_SeqAIJ *)aij->B->data;
830: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
832: PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
833: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
834: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
835: PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
836: if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
837: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
838: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
839: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
840: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
841: PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
842: PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
843: matrixA = (CsrMatrix *)matstructA->mat;
844: matrixB = (CsrMatrix *)matstructB->mat;
845: if (jacb->compressedrow.use) {
846: if (!cusparsestructB->rowoffsets_gpu) {
847: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
848: cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
849: }
850: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
851: } else {
852: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
853: }
854: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
855: ba = thrust::raw_pointer_cast(matrixB->values->data());
856: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
857: d_mat = spptr->deviceMat;
858: }
859: if (jaca->compressedrow.use) {
860: if (!cusparsestructA->rowoffsets_gpu) {
861: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
862: cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
863: }
864: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
865: } else {
866: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
867: }
868: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
869: aa = thrust::raw_pointer_cast(matrixA->values->data());
871: if (!d_mat) {
872: PetscSplitCSRDataStructure h_mat;
874: // create and populate the struct on the host, then copy it to the device
875: PetscCall(PetscInfo(A, "Create device matrix\n"));
876: PetscCall(PetscNew(&h_mat));
877: PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
878: if (size > 1) { /* need the colmap array */
879: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
880: PetscInt *colmap;
881: PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;
883: PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
885: PetscCall(PetscCalloc1(N + 1, &colmap));
886: for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
887: #if defined(PETSC_USE_64BIT_INDICES)
888: { // have to make a 64-bit (PetscInt) copy of these 32-bit device indices
889: int *h_bi32, *h_bj32;
890: PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
891: PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
892: PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
893: for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
894: PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
895: for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
897: PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
898: PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
899: PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
900: PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
902: h_mat->offdiag.i = d_bi64;
903: h_mat->offdiag.j = d_bj64;
904: h_mat->allocated_indices = PETSC_TRUE;
906: PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
907: }
908: #else
909: h_mat->offdiag.i = (PetscInt *)bi;
910: h_mat->offdiag.j = (PetscInt *)bj;
911: h_mat->allocated_indices = PETSC_FALSE;
912: #endif
913: h_mat->offdiag.a = ba;
914: h_mat->offdiag.n = A->rmap->n;
916: PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
917: PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
918: PetscCall(PetscFree(colmap));
919: }
920: h_mat->rstart = A->rmap->rstart;
921: h_mat->rend = A->rmap->rend;
922: h_mat->cstart = A->cmap->rstart;
923: h_mat->cend = A->cmap->rend;
924: h_mat->M = A->cmap->N;
925: #if defined(PETSC_USE_64BIT_INDICES)
926: {
927: int *h_ai32, *h_aj32;
928: PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
930: static_assert(sizeof(PetscInt) == 8, "");
931: PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
932: PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
933: for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
934: PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
935: for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
937: PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
938: PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
939: PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
940: PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
942: h_mat->diag.i = d_ai64;
943: h_mat->diag.j = d_aj64;
944: h_mat->allocated_indices = PETSC_TRUE;
946: PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
947: }
948: #else
949: h_mat->diag.i = (PetscInt *)ai;
950: h_mat->diag.j = (PetscInt *)aj;
951: h_mat->allocated_indices = PETSC_FALSE;
952: #endif
953: h_mat->diag.a = aa;
954: h_mat->diag.n = A->rmap->n;
955: h_mat->rank = PetscGlobalRank;
956: // copy pointers and metadata to device
957: PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
958: PetscCall(PetscFree(h_mat));
959: } else {
960: PetscCall(PetscInfo(A, "Reusing device matrix\n"));
961: }
962: *B = d_mat;
963: A->offloadmask = PETSC_OFFLOAD_GPU;
964: PetscFunctionReturn(PETSC_SUCCESS);
965: }