Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17:   // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18:   #include <thrust/async/for_each.h>
 19: #endif
 20: #include <thrust/iterator/constant_iterator.h>
 21: #include <thrust/remove.h>
 22: #include <thrust/sort.h>
 23: #include <thrust/unique.h>

 25: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 26: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 27: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
 28:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

 30:   typedef enum {
 31:       CUSPARSE_MV_ALG_DEFAULT = 0,
 32:       CUSPARSE_COOMV_ALG      = 1,
 33:       CUSPARSE_CSRMV_ALG1     = 2,
 34:       CUSPARSE_CSRMV_ALG2     = 3
 35:   } cusparseSpMVAlg_t;

 37:   typedef enum {
 38:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 39:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 40:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 41:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 42:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 43:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 44:       CUSPARSE_SPMM_COO_ALG1    = 1,
 45:       CUSPARSE_SPMM_COO_ALG2    = 2,
 46:       CUSPARSE_SPMM_COO_ALG3    = 3,
 47:       CUSPARSE_SPMM_COO_ALG4    = 5,
 48:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 49:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 50:   } cusparseSpMMAlg_t;

 52:   typedef enum {
 53:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 54:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 55:   } cusparseCsr2CscAlg_t;
 56:   */
 57: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 58: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 59: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 60: #endif
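
/* A brief illustrative note (not part of the library code): the string arrays above must follow the
   0-based integer values of the cuSPARSE enums because PetscOptionsEnum() returns the *position* of the
   chosen string, and that position is stored directly into the cuSPARSE enum field. For example, assuming
   the CUDA 11.0 enums quoted above,

     -mat_cusparse_spmv_alg csrmv_alg1

   selects position 2 in MatCUSPARSESpMVAlgorithms[], which must equal CUSPARSE_CSRMV_ALG1 = 2.
   The PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below guard exactly this invariant. */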

 62: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 63: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 64: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 65: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 66: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
 67: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 68: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 69: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 71: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 72: #endif
 73: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 74: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 75: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 76: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 77: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 78: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 79: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 80: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 81: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 82: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 84: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 85: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 86: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 87: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

 89: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 90: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 92: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 93: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 94: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 96: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 97: {
 98:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

100:   PetscFunctionBegin;
101:   switch (op) {
102:   case MAT_CUSPARSE_MULT:
103:     cusparsestruct->format = format;
104:     break;
105:   case MAT_CUSPARSE_ALL:
106:     cusparsestruct->format = format;
107:     break;
108:   default:
109:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
110:   }
111:   PetscFunctionReturn(PETSC_SUCCESS);
112: }

114: /*@
115:    MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
116:    operation. Only the `MatMult()` operation can use different GPU storage formats.

118:    Not Collective

120:    Input Parameters:
121: +  A - Matrix of type `MATSEQAIJCUSPARSE`
122: .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
123:         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
124: -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).

126:    Level: intermediate

128: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129: @*/
130: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
131: {
132:   PetscFunctionBegin;
134:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
135:   PetscFunctionReturn(PETSC_SUCCESS);
136: }
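
/* A hedged usage sketch (illustrative only, not from this file): switching a sequential AIJ matrix that
   lives on the GPU to the ELL storage format for MatMult(). The matrix A is hypothetical user code.

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     // ... set sizes, preallocate, and insert values ...
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   The same effect is available at run time with -mat_cusparse_mult_storage_format ell, parsed in
   MatSetFromOptions_SeqAIJCUSPARSE() below. */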

138: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
139: {
140:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

142:   PetscFunctionBegin;
143:   cusparsestruct->use_cpu_solve = use_cpu;
144:   PetscFunctionReturn(PETSC_SUCCESS);
145: }

147: /*@
148:    MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

150:    Input Parameters:
151: +  A - Matrix of type `MATSEQAIJCUSPARSE`
152: -  use_cpu - set flag for using the built-in CPU `MatSolve()`

154:    Level: intermediate

156:    Note:
157:    The cuSPARSE LU solver currently computes the factors with the built-in CPU method
158:    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
159:    Use this routine to specify whether the solve is done on the CPU or the GPU (GPU is the default).

161: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162: @*/
163: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
164: {
165:   PetscFunctionBegin;
167:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
168:   PetscFunctionReturn(PETSC_SUCCESS);
169: }
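
/* A hedged usage sketch (illustrative only): keep the (I)LU triangular solves on the CPU, as described in
   the note above. A is a hypothetical MATSEQAIJCUSPARSE matrix.

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or, equivalently at run time, -mat_cusparse_use_cpu_solve (parsed in MatSetFromOptions_SeqAIJCUSPARSE()
   below). */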

171: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
172: {
173:   PetscFunctionBegin;
174:   switch (op) {
175:   case MAT_FORM_EXPLICIT_TRANSPOSE:
176:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
177:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
178:     A->form_explicit_transpose = flg;
179:     break;
180:   default:
181:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
182:     break;
183:   }
184:   PetscFunctionReturn(PETSC_SUCCESS);
185: }

187: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
188: {
189:   MatCUSPARSEStorageFormat format;
190:   PetscBool                flg;
191:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

193:   PetscFunctionBegin;
194:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
195:   if (A->factortype == MAT_FACTOR_NONE) {
196:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
197:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

199:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
201:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
202:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
203: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
204:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
205:     /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
206:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
207:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
208:   #else
209:     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210:   #endif
211:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
212:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

214:     PetscCall(
215:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
216:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
217: #endif
218:   }
219:   PetscOptionsHeadEnd();
220:   PetscFunctionReturn(PETSC_SUCCESS);
221: }
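
/* A hedged summary of the run-time options handled above (database names only; values are examples):

     -mat_cusparse_storage_format csr|ell|hyb        storage format for SpMV and TriSolve
     -mat_cusparse_mult_storage_format csr|ell|hyb   storage format for SpMV only
     -mat_cusparse_use_cpu_solve                     do the (I)LU solve on the CPU
     -mat_cusparse_spmv_alg ...                      cuSPARSE SpMV algorithm (CUDA >= 11)
     -mat_cusparse_spmm_alg ...                      cuSPARSE SpMM algorithm (CUDA >= 11)
     -mat_cusparse_csr2csc_alg alg1|alg2             CSR-to-CSC conversion algorithm (CUDA >= 11)

   These take effect when MatSetFromOptions() is called on a non-factored MATSEQAIJCUSPARSE matrix. */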

223: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
224: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
225: {
226:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
227:   PetscInt                      m  = A->rmap->n;
228:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
229:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
230:   const MatScalar              *Aa = a->a;
231:   PetscInt                     *Mi, *Mj, Mnz;
232:   PetscScalar                  *Ma;

234:   PetscFunctionBegin;
235:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
236:     if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Test csrRowPtr since it is not null even when m=0
237:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
238:       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
239:       PetscCall(PetscMalloc1(m + 1, &Mi));
240:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
241:       PetscCall(PetscMalloc1(Mnz, &Ma));
242:       Mi[0] = 0;
243:       for (PetscInt i = 0; i < m; i++) {
244:         PetscInt llen = Ai[i + 1] - Ai[i];
245:         PetscInt ulen = Adiag[i] - Adiag[i + 1];
246:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
247:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
248:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
249:         Mi[i + 1] = Mi[i] + llen + ulen;
250:       }
251:       // Copy M (L,U) from host to device
252:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
253:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
254:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
255:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
256:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

258:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
259:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
260:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
261:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
262:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
263:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
264:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
265:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

267:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
268:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
269:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

271:       fillMode = CUSPARSE_FILL_MODE_UPPER;
272:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
273:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
274:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
275:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

277:       // Allocate work vectors in SpSv
278:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
279:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

281:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
282:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

284:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
285:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
286:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
287:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
288:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
289:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
290:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

292:       // Record for reuse
293:       fs->csrRowPtr_h = Mi;
294:       fs->csrVal_h    = Ma;
295:       PetscCall(PetscFree(Mj));
296:     }
297:     // Copy the value
298:     Mi  = fs->csrRowPtr_h;
299:     Ma  = fs->csrVal_h;
300:     Mnz = Mi[m];
301:     for (PetscInt i = 0; i < m; i++) {
302:       PetscInt llen = Ai[i + 1] - Ai[i];
303:       PetscInt ulen = Adiag[i] - Adiag[i + 1];
304:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
305:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
306:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
307:     }
308:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

310:     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
311:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

313:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

315:     // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
316:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
317:   }
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }
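
/* A summary sketch of the merged-CSR layout produced above (derived from the row loop; nothing beyond
   what the code already does). For row i of the host factor in Mat_SeqAIJ form,

     L part   : column indices Aj[Ai[i] .. Ai[i+1]-1],          values Aa[Ai[i] .. Ai[i+1]-1]
     diagonal : column i,                                       value  1.0 / Aa[Adiag[i]]
     U part   : column indices Aj[Adiag[i+1]+1 .. Adiag[i]-1],  values Aa[Adiag[i+1]+1 .. Adiag[i]-1]

   are concatenated into row i of M (Mi/Mj/Ma), so a single CSR matrix holds both the unit-lower factor L
   and the non-unit upper factor U, and the two cusparseSpSV descriptors differ only in fill mode and
   diagonal type. */
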
320: #else
321: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322: {
323:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
324:   PetscInt                           n                  = A->rmap->n;
325:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
327:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
328:   const MatScalar                   *aa = a->a, *v;
329:   PetscInt                          *AiLo, *AjLo;
330:   PetscInt                           i, nz, nzLower, offset, rowOffset;

332:   PetscFunctionBegin;
333:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
335:     try {
336:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
337:       nzLower = n + ai[n] - ai[1];
338:       if (!loTriFactor) {
339:         PetscScalar *AALo;

341:         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

343:         /* Allocate Space for the lower triangular matrix */
344:         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
345:         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

347:         /* Fill the lower triangular matrix */
348:         AiLo[0]   = (PetscInt)0;
349:         AiLo[n]   = nzLower;
350:         AjLo[0]   = (PetscInt)0;
351:         AALo[0]   = (MatScalar)1.0;
352:         v         = aa;
353:         vi        = aj;
354:         offset    = 1;
355:         rowOffset = 1;
356:         for (i = 1; i < n; i++) {
357:           nz = ai[i + 1] - ai[i];
358:           /* additional 1 for the term on the diagonal */
359:           AiLo[i] = rowOffset;
360:           rowOffset += nz + 1;

362:           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
363:           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

365:           offset += nz;
366:           AjLo[offset] = (PetscInt)i;
367:           AALo[offset] = (MatScalar)1.0;
368:           offset += 1;

370:           v += nz;
371:           vi += nz;
372:         }

374:         /* allocate space for the triangular factor information */
375:         PetscCall(PetscNew(&loTriFactor));
376:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377:         /* Create the matrix description */
378:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
379:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
380:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
381:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382:   #else
383:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384:   #endif
385:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
386:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

388:         /* set the operation */
389:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

391:         /* set the matrix */
392:         loTriFactor->csrMat              = new CsrMatrix;
393:         loTriFactor->csrMat->num_rows    = n;
394:         loTriFactor->csrMat->num_cols    = n;
395:         loTriFactor->csrMat->num_entries = nzLower;

397:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

400:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

403:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

406:         /* Create the solve analysis information */
407:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
409:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
410:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
411:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
412:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413:   #endif

415:         /* perform the solve analysis */
416:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
417:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
418:         PetscCallCUDA(WaitForCUDA());
419:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

421:         /* assign the pointer */
422:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
423:         loTriFactor->AA_h                                          = AALo;
424:         PetscCallCUDA(cudaFreeHost(AiLo));
425:         PetscCallCUDA(cudaFreeHost(AjLo));
426:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427:       } else { /* update values only */
428:         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429:         /* Fill the lower triangular matrix */
430:         loTriFactor->AA_h[0] = 1.0;
431:         v                    = aa;
432:         vi                   = aj;
433:         offset               = 1;
434:         for (i = 1; i < n; i++) {
435:           nz = ai[i + 1] - ai[i];
436:           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437:           offset += nz;
438:           loTriFactor->AA_h[offset] = 1.0;
439:           offset += 1;
440:           v += nz;
441:         }
442:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
443:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444:       }
445:     } catch (char *ex) {
446:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447:     }
448:   }
449:   PetscFunctionReturn(PETSC_SUCCESS);
450: }
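
/* Summary of the host layout built above (pre-CUDA-11.4 path, derived from the loops): row i of the
   lower-triangular factor AiLo/AjLo/AALo holds the strictly-lower entries of row i taken from aj/aa,
   followed by an explicit 1.0 at column i, so the factor is a complete CSR matrix even though the
   descriptor declares CUSPARSE_DIAG_TYPE_UNIT and the solve never reads those stored ones. */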

452: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
453: {
454:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
455:   PetscInt                           n                  = A->rmap->n;
456:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
457:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
458:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
459:   const MatScalar                   *aa = a->a, *v;
460:   PetscInt                          *AiUp, *AjUp;
461:   PetscInt                           i, nz, nzUpper, offset;

463:   PetscFunctionBegin;
464:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
465:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
466:     try {
467:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
468:       nzUpper = adiag[0] - adiag[n];
469:       if (!upTriFactor) {
470:         PetscScalar *AAUp;

472:         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

474:         /* Allocate Space for the upper triangular matrix */
475:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
476:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

478:         /* Fill the upper triangular matrix */
479:         AiUp[0] = (PetscInt)0;
480:         AiUp[n] = nzUpper;
481:         offset  = nzUpper;
482:         for (i = n - 1; i >= 0; i--) {
483:           v  = aa + adiag[i + 1] + 1;
484:           vi = aj + adiag[i + 1] + 1;

486:           /* number of elements NOT on the diagonal */
487:           nz = adiag[i] - adiag[i + 1] - 1;

489:           /* decrement the offset */
490:           offset -= (nz + 1);

492:           /* first, set the diagonal elements */
493:           AjUp[offset] = (PetscInt)i;
494:           AAUp[offset] = (MatScalar)1. / v[nz];
495:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

497:           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
498:           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
499:         }

501:         /* allocate space for the triangular factor information */
502:         PetscCall(PetscNew(&upTriFactor));
503:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

505:         /* Create the matrix description */
506:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
507:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
508:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
509:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
510:   #else
511:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
512:   #endif
513:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
514:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

516:         /* set the operation */
517:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

519:         /* set the matrix */
520:         upTriFactor->csrMat              = new CsrMatrix;
521:         upTriFactor->csrMat->num_rows    = n;
522:         upTriFactor->csrMat->num_cols    = n;
523:         upTriFactor->csrMat->num_entries = nzUpper;

525:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
526:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

528:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
529:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

531:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
532:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

534:         /* Create the solve analysis information */
535:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
537:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
538:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
539:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
540:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
541:   #endif

543:         /* perform the solve analysis */
544:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
545:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

547:         PetscCallCUDA(WaitForCUDA());
548:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

550:         /* assign the pointer */
551:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
552:         upTriFactor->AA_h                                          = AAUp;
553:         PetscCallCUDA(cudaFreeHost(AiUp));
554:         PetscCallCUDA(cudaFreeHost(AjUp));
555:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
556:       } else {
557:         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
558:         /* Fill the upper triangular matrix */
559:         offset = nzUpper;
560:         for (i = n - 1; i >= 0; i--) {
561:           v = aa + adiag[i + 1] + 1;

563:           /* number of elements NOT on the diagonal */
564:           nz = adiag[i] - adiag[i + 1] - 1;

566:           /* decrement the offset */
567:           offset -= (nz + 1);

569:           /* first, set the diagonal elements */
570:           upTriFactor->AA_h[offset] = 1. / v[nz];
571:           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
572:         }
573:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
574:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
575:       }
576:     } catch (char *ex) {
577:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
578:     }
579:   }
580:   PetscFunctionReturn(PETSC_SUCCESS);
581: }
582: #endif

584: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
585: {
586:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
587:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
588:   IS                            isrow = a->row, iscol = a->icol;
589:   PetscBool                     row_identity, col_identity;
590:   PetscInt                      n = A->rmap->n;

592:   PetscFunctionBegin;
593:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
594: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
595:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
596: #else
597:   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
598:   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
599:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
600: #endif

602:   cusparseTriFactors->nnz = a->nz;

604:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
605:   /* lower triangular indices */
606:   PetscCall(ISIdentity(isrow, &row_identity));
607:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
608:     const PetscInt *r;

610:     PetscCall(ISGetIndices(isrow, &r));
611:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
612:     cusparseTriFactors->rpermIndices->assign(r, r + n);
613:     PetscCall(ISRestoreIndices(isrow, &r));
614:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
615:   }

617:   /* upper triangular indices */
618:   PetscCall(ISIdentity(iscol, &col_identity));
619:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
620:     const PetscInt *c;

622:     PetscCall(ISGetIndices(iscol, &c));
623:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
624:     cusparseTriFactors->cpermIndices->assign(c, c + n);
625:     PetscCall(ISRestoreIndices(iscol, &c));
626:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
627:   }
628:   PetscFunctionReturn(PETSC_SUCCESS);
629: }
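
/* Note on the permutation arrays uploaded above (derived from the solve routines): rpermIndices and
   cpermIndices hold the row and column orderings of the factorization; the GPU solves apply them on the
   fly with thrust::make_permutation_iterator() when gathering b into the work vector X and scattering the
   result back into x (see MatSolve_SeqAIJCUSPARSE_Cholesky() below for the pattern), so the vectors never
   need to be permuted on the host. */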

631: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
632: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
633: {
634:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
635:   PetscInt                      m  = A->rmap->n;
636:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
637:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
638:   const MatScalar              *Aa = a->a;
639:   PetscInt                     *Mj, Mnz;
640:   PetscScalar                  *Ma, *D;

642:   PetscFunctionBegin;
643:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
644:     if (!fs->csrRowPtr) {                    // Is this the first time doing the setup? Test csrRowPtr since it is not null even when m=0
645:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
646:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
647:       Mnz = Ai[m]; // Unz (with the unit diagonal)
648:       PetscCall(PetscMalloc1(Mnz, &Ma));
649:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
650:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
651:       for (PetscInt i = 0; i < m; i++) {
652:         PetscInt ulen = Ai[i + 1] - Ai[i];
653:         Mj[Ai[i]]     = i;                                              // diagonal entry
654:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
655:       }
656:       // Copy M (U) from host to device
657:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
658:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
659:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
660:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
661:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
662:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

664:       // Create a descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
665:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
666:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
667:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
668:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
669:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
670:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
671:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

673:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
674:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
675:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

677:       // Allocate work vectors in SpSv
678:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
679:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

681:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
682:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

684:       // Query buffer sizes for SpSV and then allocate buffers
685:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
686:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
687:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

689:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
690:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
691:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

693:       // Record for reuse
694:       fs->csrVal_h = Ma;
695:       fs->diag_h   = D;
696:       PetscCall(PetscFree(Mj));
697:     }
698:     // Copy the value
699:     Ma  = fs->csrVal_h;
700:     D   = fs->diag_h;
701:     Mnz = Ai[m];
702:     for (PetscInt i = 0; i < m; i++) {
703:       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
704:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
705:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
706:     }
707:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
708:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

710:     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
711:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
712:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
713:   }
714:   PetscFunctionReturn(PETSC_SUCCESS);
715: }
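
/* Summary of the value mapping above (derived from the copy loop; see MatICCFactorSymbolic_SeqAIJ() for
   the host layout): for each row i,

     D[i]              = Aa[Adiag[i]]     (the stored inverse of the diagonal)
     Ma[Ai[i]]         = 1.0              (explicit unit diagonal of U, unread under CUSPARSE_DIAG_TYPE_UNIT)
     Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]   (negated strictly-upper entries)

   so the factorization is applied in MatSolve_SeqAIJCUSPARSE_Cholesky() below as a solve with Ut, a
   pointwise multiply by D, and a solve with U. */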

717: // Solve Ut D U x = b
718: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
719: {
720:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
721:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
722:   const PetscScalar                    *barray;
723:   PetscScalar                          *xarray;
724:   thrust::device_ptr<const PetscScalar> bGPU;
725:   thrust::device_ptr<PetscScalar>       xGPU;
726:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
727:   PetscInt                              m   = A->rmap->n;

729:   PetscFunctionBegin;
730:   PetscCall(PetscLogGpuTimeBegin());
731:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
732:   PetscCall(VecCUDAGetArrayRead(b, &barray));
733:   xGPU = thrust::device_pointer_cast(xarray);
734:   bGPU = thrust::device_pointer_cast(barray);

736:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
737:   if (fs->rpermIndices) {
738:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
739:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
740:   } else {
741:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
742:   }

744:   // Solve Ut Y = X
745:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
746:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

748:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
749:   // It is basically a vector element-wise multiplication, but cublas does not have it!
750:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

752:   // Solve U X = Y
753:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
754:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
755:   } else {
756:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
757:   }
758:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

760:   // Reorder X with the column permutation if needed, and put the result back to x
761:   if (fs->cpermIndices) {
762:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
763:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
764:   }

766:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
767:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
768:   PetscCall(PetscLogGpuTimeEnd());
769:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
770:   PetscFunctionReturn(PETSC_SUCCESS);
771: }
772: #else
773: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
774: {
775:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
776:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
777:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
778:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
779:   PetscInt                          *AiUp, *AjUp;
780:   PetscScalar                       *AAUp;
781:   PetscScalar                       *AALo;
782:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
783:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
784:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
785:   const MatScalar                   *aa = b->a, *v;

787:   PetscFunctionBegin;
788:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
789:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
790:     try {
791:       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
792:       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
793:       if (!upTriFactor && !loTriFactor) {
794:         /* Allocate Space for the upper triangular matrix */
795:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
796:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

798:         /* Fill the upper triangular matrix */
799:         AiUp[0] = (PetscInt)0;
800:         AiUp[n] = nzUpper;
801:         offset  = 0;
802:         for (i = 0; i < n; i++) {
803:           /* set the pointers */
804:           v  = aa + ai[i];
805:           vj = aj + ai[i];
806:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

808:           /* first, set the diagonal elements */
809:           AjUp[offset] = (PetscInt)i;
810:           AAUp[offset] = (MatScalar)1.0 / v[nz];
811:           AiUp[i]      = offset;
812:           AALo[offset] = (MatScalar)1.0 / v[nz];

814:           offset += 1;
815:           if (nz > 0) {
816:             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
817:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
818:             for (j = offset; j < offset + nz; j++) {
819:               AAUp[j] = -AAUp[j];
820:               AALo[j] = AAUp[j] / v[nz];
821:             }
822:             offset += nz;
823:           }
824:         }

826:         /* allocate space for the triangular factor information */
827:         PetscCall(PetscNew(&upTriFactor));
828:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

830:         /* Create the matrix description */
831:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
832:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
833:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
834:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
835:   #else
836:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
837:   #endif
838:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
839:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

841:         /* set the matrix */
842:         upTriFactor->csrMat              = new CsrMatrix;
843:         upTriFactor->csrMat->num_rows    = A->rmap->n;
844:         upTriFactor->csrMat->num_cols    = A->cmap->n;
845:         upTriFactor->csrMat->num_entries = a->nz;

847:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
848:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

850:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

853:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

856:         /* set the operation */
857:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

859:         /* Create the solve analysis information */
860:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
861:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
862:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
863:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
864:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
865:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
866:   #endif

868:         /* perform the solve analysis */
869:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
870:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

872:         PetscCallCUDA(WaitForCUDA());
873:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

875:         /* assign the pointer */
876:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

878:         /* allocate space for the triangular factor information */
879:         PetscCall(PetscNew(&loTriFactor));
880:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

882:         /* Create the matrix description */
883:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
884:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
885:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
886:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
887:   #else
888:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
889:   #endif
890:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
891:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

893:         /* set the operation */
894:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

896:         /* set the matrix */
897:         loTriFactor->csrMat              = new CsrMatrix;
898:         loTriFactor->csrMat->num_rows    = A->rmap->n;
899:         loTriFactor->csrMat->num_cols    = A->cmap->n;
900:         loTriFactor->csrMat->num_entries = a->nz;

902:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
903:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

905:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
906:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

908:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
909:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

911:         /* Create the solve analysis information */
912:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
913:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
914:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
915:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
916:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
917:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
918:   #endif

920:         /* perform the solve analysis */
921:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
922:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

924:         PetscCallCUDA(WaitForCUDA());
925:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

927:         /* assign the pointer */
928:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

930:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
931:         PetscCallCUDA(cudaFreeHost(AiUp));
932:         PetscCallCUDA(cudaFreeHost(AjUp));
933:       } else {
934:         /* Fill the upper triangular matrix */
935:         offset = 0;
936:         for (i = 0; i < n; i++) {
937:           /* set the pointers */
938:           v  = aa + ai[i];
939:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

941:           /* first, set the diagonal elements */
942:           AAUp[offset] = 1.0 / v[nz];
943:           AALo[offset] = 1.0 / v[nz];

945:           offset += 1;
946:           if (nz > 0) {
947:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
948:             for (j = offset; j < offset + nz; j++) {
949:               AAUp[j] = -AAUp[j];
950:               AALo[j] = AAUp[j] / v[nz];
951:             }
952:             offset += nz;
953:           }
954:         }
955:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing upper triangular factor");
956:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing lower triangular factor");
957:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
958:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
959:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
960:       }
961:       PetscCallCUDA(cudaFreeHost(AAUp));
962:       PetscCallCUDA(cudaFreeHost(AALo));
963:     } catch (char *ex) {
964:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
965:     }
966:   }
967:   PetscFunctionReturn(PETSC_SUCCESS);
968: }
969: #endif

971: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
972: {
973:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
974:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
975:   IS                            ip                 = a->row;
976:   PetscBool                     perm_identity;
977:   PetscInt                      n = A->rmap->n;

979:   PetscFunctionBegin;
980:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

982: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
983:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
984: #else
985:   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
986:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
987: #endif
988:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

990:   A->offloadmask = PETSC_OFFLOAD_BOTH;

992:   /* lower triangular indices */
993:   PetscCall(ISIdentity(ip, &perm_identity));
994:   if (!perm_identity) {
995:     IS              iip;
996:     const PetscInt *irip, *rip;

998:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
999:     PetscCall(ISGetIndices(iip, &irip));
1000:     PetscCall(ISGetIndices(ip, &rip));
1001:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1002:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1003:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1004:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1005:     PetscCall(ISRestoreIndices(iip, &irip));
1006:     PetscCall(ISDestroy(&iip));
1007:     PetscCall(ISRestoreIndices(ip, &rip));
1008:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1009:   }
1010:   PetscFunctionReturn(PETSC_SUCCESS);
1011: }

1013: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1014: {
1015:   PetscFunctionBegin;
1016:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1017:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1018:   B->offloadmask = PETSC_OFFLOAD_CPU;

1020: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1021:   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1022:   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1023: #else
1024:   /* determine which version of MatSolve needs to be used. */
1025:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1026:   IS          ip = b->row;
1027:   PetscBool   perm_identity;

1029:   PetscCall(ISIdentity(ip, &perm_identity));
1030:   if (perm_identity) {
1031:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1032:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1033:   } else {
1034:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1035:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1036:   }
1037: #endif
1038:   B->ops->matsolve          = NULL;
1039:   B->ops->matsolvetranspose = NULL;

1041:   /* get the triangular factors */
1042:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1043:   PetscFunctionReturn(PETSC_SUCCESS);
1044: }
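/*
   A minimal user-level sketch (illustration only, not part of this file) of how the GPU Cholesky/ICC
   path above is typically reached through the public PETSc API, assuming A is a MATSEQAIJCUSPARSE
   matrix and perm/info have been set up beforehand:

     Mat F;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ICC, &F));
     PetscCall(MatICCFactorSymbolic(F, A, perm, &info));
     PetscCall(MatCholeskyFactorNumeric(F, A, &info)); // dispatches to MatCholeskyFactorNumeric_SeqAIJCUSPARSE() above
     PetscCall(MatSolve(F, b, x));                     // uses the solve routines installed above
*/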

1046: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048: {
1049:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054:   cusparseIndexBase_t                indexBase;
1055:   cusparseMatrixType_t               matrixType;
1056:   cusparseFillMode_t                 fillMode;
1057:   cusparseDiagType_t                 diagType;

1059:   PetscFunctionBegin;
1060:   /* allocate space for the transpose of the lower triangular factor */
1061:   PetscCall(PetscNew(&loTriFactorT));
1062:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1064:   /* set the matrix descriptors of the lower triangular factor */
1065:   matrixType = cusparseGetMatType(loTriFactor->descr);
1066:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1067:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

1070:   /* Create the matrix description */
1071:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

1077:   /* set the operation */
1078:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1080:   /* allocate GPU space for the CSC of the lower triangular factor */
1081:   loTriFactorT->csrMat                 = new CsrMatrix;
1082:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1089:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095:   #endif

1097:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098:   {
1099:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104:   #else
1105:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106:   #endif
1107:     PetscCallCUSPARSE(stat);
1108:   }

1110:   PetscCallCUDA(WaitForCUDA());
1111:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1113:   /* Create the solve analysis information */
1114:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120:   #endif

1122:   /* perform the solve analysis */
1123:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1126:   PetscCallCUDA(WaitForCUDA());
1127:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1129:   /* assign the pointer */
1130:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1132:   /*********************************************/
1133:   /* Now the Transpose of the Upper Tri Factor */
1134:   /*********************************************/

1136:   /* allocate space for the transpose of the upper triangular factor */
1137:   PetscCall(PetscNew(&upTriFactorT));
1138:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1140:   /* set the matrix descriptors of the upper triangular factor */
1141:   matrixType = cusparseGetMatType(upTriFactor->descr);
1142:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1143:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

1146:   /* Create the matrix description */
1147:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

1153:   /* set the operation */
1154:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1156:   /* allocate GPU space for the CSC of the upper triangular factor */
1157:   upTriFactorT->csrMat                 = new CsrMatrix;
1158:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1165:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171:   #endif

1173:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174:   {
1175:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180:   #else
1181:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182:   #endif
1183:     PetscCallCUSPARSE(stat);
1184:   }

1186:   PetscCallCUDA(WaitForCUDA());
1187:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1189:   /* Create the solve analysis information */
1190:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196:   #endif

1198:   /* perform the solve analysis */
1199:   /* TODO: this repeated transpose-analysis setup should be refactored into a helper function */
1200:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1203:   PetscCallCUDA(WaitForCUDA());
1204:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1206:   /* assign the pointer */
1207:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208:   PetscFunctionReturn(PETSC_SUCCESS);
1209: }
1210: #endif
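/*
   The explicit CSC/transposed-factor construction above is only needed for the legacy csrsv2-based
   solves guarded by PETSC_PKG_CUDA_VERSION_LT(11, 4, 0). With CUDA >= 11.4 the SpSV-based transpose
   solves further below reuse the original L/U descriptors with CUSPARSE_OPERATION_TRANSPOSE instead
   of materializing transposed copies of the factors.
*/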

1212: struct PetscScalarToPetscInt {
1213:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1214: };
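/*
   PetscScalarToPetscInt supports the one-time construction of cusparsestruct->csr2csc_i below:
   csr2csc is first run on the sequence 0, 1, ..., nnz-1 stored as PetscScalars, so the "values"
   of the transposed matrix come out carrying, for each entry of A^T, the position of the matching
   entry in A's CSR value array; this functor converts those scalars back to PetscInt indices.
   Conceptually (a sketch, not literal code from this file):

     csr2csc_i[k] = (PetscInt)PetscRealPart(T_values[k]);  // after the index-carrying csr2csc
     T_values[k]  = A_values[csr2csc_i[k]];                // every later transpose value update
*/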

1216: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1217: {
1218:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1219:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1221:   cusparseStatus_t              stat;
1222:   cusparseIndexBase_t           indexBase;

1224:   PetscFunctionBegin;
1225:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1226:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1227:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1228:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1229:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1230:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1231:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1232:   PetscCall(PetscLogGpuTimeBegin());
1233:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1234:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1235:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1236:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1237:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1238:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1239:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1241:     /* set alpha and beta */
1242:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1243:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1244:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1245:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1246:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1247:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

1249:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1250:       CsrMatrix *matrixT      = new CsrMatrix;
1251:       matstructT->mat         = matrixT;
1252:       matrixT->num_rows       = A->cmap->n;
1253:       matrixT->num_cols       = A->rmap->n;
1254:       matrixT->num_entries    = a->nz;
1255:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1256:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1257:       matrixT->values         = new THRUSTARRAY(a->nz);

1259:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1260:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1262: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1263:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1264:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265:                                indexBase, cusparse_scalartype);
1266:       PetscCallCUSPARSE(stat);
1267:   #else
1268:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1269:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1271:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1272:            it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1273:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1274:         */
1275:       if (matrixT->num_entries) {
1276:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1277:         PetscCallCUSPARSE(stat);

1279:       } else {
1280:         matstructT->matDescr = NULL;
1281:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1282:       }
1283:   #endif
1284: #endif
1285:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1286: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1287:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1288: #else
1289:       CsrMatrix *temp  = new CsrMatrix;
1290:       CsrMatrix *tempT = new CsrMatrix;
1291:       /* First convert HYB to CSR */
1292:       temp->num_rows       = A->rmap->n;
1293:       temp->num_cols       = A->cmap->n;
1294:       temp->num_entries    = a->nz;
1295:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1296:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1297:       temp->values         = new THRUSTARRAY(a->nz);

1299:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1300:       PetscCallCUSPARSE(stat);

1302:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303:       tempT->num_rows       = A->rmap->n;
1304:       tempT->num_cols       = A->cmap->n;
1305:       tempT->num_entries    = a->nz;
1306:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1307:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308:       tempT->values         = new THRUSTARRAY(a->nz);

1310:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1311:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1312:       PetscCallCUSPARSE(stat);

1314:       /* Last, convert CSC to HYB */
1315:       cusparseHybMat_t hybMat;
1316:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1317:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1318:       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1319:       PetscCallCUSPARSE(stat);

1321:       /* assign the pointer */
1322:       matstructT->mat = hybMat;
1323:       A->transupdated = PETSC_TRUE;
1324:       /* delete temporaries */
1325:       if (tempT) {
1326:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1327:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1328:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1329:         delete (CsrMatrix *)tempT;
1330:       }
1331:       if (temp) {
1332:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1333:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1334:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1335:         delete (CsrMatrix *)temp;
1336:       }
1337: #endif
1338:     }
1339:   }
1340:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1341:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1342:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1343:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1344:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1345:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1346:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1347:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1348:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1349:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1350:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1351:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1352:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1353:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1354:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1355:     }
1356:     if (!cusparsestruct->csr2csc_i) {
1357:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1358:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1360:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1361: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1362:       void  *csr2cscBuffer;
1363:       size_t csr2cscBufferSize;
1364:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1365:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1366:       PetscCallCUSPARSE(stat);
1367:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1368: #endif

1370:       if (matrix->num_entries) {
1371:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is OK.
1373:            I checked every parameter and they were just fine. I have no clue why cusparse complains.

1375:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376:            should be filled with indexBase. So I just take a shortcut here.
1377:         */
1378:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1379: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1380:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1381:         PetscCallCUSPARSE(stat);
1382: #else
1383:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1384:         PetscCallCUSPARSE(stat);
1385: #endif
1386:       } else {
1387:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1388:       }

1390:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1391:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1392: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1393:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1394: #endif
1395:     }
1396:     PetscCallThrust(
1397:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1398:   }
1399:   PetscCall(PetscLogGpuTimeEnd());
1400:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1401:   /* the compressed row indices are not used for matTranspose */
1402:   matstructT->cprowIndices = NULL;
1403:   /* assign the pointer */
1404:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1405:   A->transupdated                                = PETSC_TRUE;
1406:   PetscFunctionReturn(PETSC_SUCCESS);
1407: }
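/*
   Reuse note: after the first call, matTranspose and csr2csc_i persist in A->spptr, so a subsequent
   call (e.g. after the numerical values of A change) skips the cusparse csr2csc conversion entirely
   and only re-runs the final permutation copy T_values[k] = A_values[csr2csc_i[k]] on the GPU.
*/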

1409: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1410: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1411: {
1412:   const PetscScalar                    *barray;
1413:   PetscScalar                          *xarray;
1414:   thrust::device_ptr<const PetscScalar> bGPU;
1415:   thrust::device_ptr<PetscScalar>       xGPU;
1416:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1417:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1418:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1419:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1420:   PetscInt                              m   = A->rmap->n;

1422:   PetscFunctionBegin;
1423:   PetscCall(PetscLogGpuTimeBegin());
1424:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1425:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1426:   xGPU = thrust::device_pointer_cast(xarray);
1427:   bGPU = thrust::device_pointer_cast(barray);

1429:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1430:   if (fs->rpermIndices) {
1431:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1432:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1433:   } else {
1434:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1435:   }

1437:   // Solve L Y = X
1438:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1439:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1440:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

1442:   // Solve U X = Y
1443:   if (fs->cpermIndices) {
1444:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1445:   } else {
1446:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1447:   }
1448:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

1450:   // Reorder X with the column permutation if needed, and put the result back to x
1451:   if (fs->cpermIndices) {
1452:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1453:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1454:   }
1455:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1456:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1457:   PetscCall(PetscLogGpuTimeEnd());
1458:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1459:   PetscFunctionReturn(PETSC_SUCCESS);
1460: }

1462: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463: {
1464:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466:   const PetscScalar                    *barray;
1467:   PetscScalar                          *xarray;
1468:   thrust::device_ptr<const PetscScalar> bGPU;
1469:   thrust::device_ptr<PetscScalar>       xGPU;
1470:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1471:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472:   PetscInt                              m   = A->rmap->n;

1474:   PetscFunctionBegin;
1475:   PetscCall(PetscLogGpuTimeBegin());
1476:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1481:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486:   }

1488:   if (!fs->updatedTransposeSpSVAnalysis) {
1489:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1491:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493:   }

1495:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1497:   xGPU = thrust::device_pointer_cast(xarray);
1498:   bGPU = thrust::device_pointer_cast(barray);

1500:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501:   if (fs->rpermIndices) {
1502:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504:   } else {
1505:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506:   }

1508:   // Solve Ut Y = X
1509:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

1512:   // Solve Lt X = Y
1513:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515:   } else {
1516:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517:   }
1518:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

1520:   // Reorder X with the column permutation if needed, and put the result back to x
1521:   if (fs->cpermIndices) {
1522:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524:   }

1526:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528:   PetscCall(PetscLogGpuTimeEnd());
1529:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530:   PetscFunctionReturn(PETSC_SUCCESS);
1531: }
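/*
   Unlike the pre-11.4 path, MatSolveTranspose_SeqAIJCUSPARSE_LU() does not materialize transposed
   factors: the same spMatDescr_L/U are reused with CUSPARSE_OPERATION_TRANSPOSE, at the cost of one
   extra SpSV analysis (spsvDescr_Lt/Ut) performed lazily on the first transpose solve. Illustration
   only, assuming F came from MatGetFactor(..., MATSOLVERCUSPARSE, ...):

     PetscCall(MatSolveTranspose(F, b, x)); // solves A^T x = b with the factors of A
*/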
1532: #else
1533: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1534: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535: {
1536:   PetscInt                              n = xx->map->n;
1537:   const PetscScalar                    *barray;
1538:   PetscScalar                          *xarray;
1539:   thrust::device_ptr<const PetscScalar> bGPU;
1540:   thrust::device_ptr<PetscScalar>       xGPU;
1541:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1546:   PetscFunctionBegin;
1547:   /* Analyze the matrix and create the transpose ... on the fly */
1548:   if (!loTriFactorT && !upTriFactorT) {
1549:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552:   }

1554:   /* Get the GPU pointers */
1555:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1556:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557:   xGPU = thrust::device_pointer_cast(xarray);
1558:   bGPU = thrust::device_pointer_cast(barray);

1560:   PetscCall(PetscLogGpuTimeBegin());
1561:   /* First, reorder with the row permutation */
1562:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1564:   /* Next, solve U */
1565:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1566:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1568:   /* Then, solve L */
1569:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1570:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1572:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1573:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1575:   /* Copy the temporary to the full solution. */
1576:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1578:   /* restore */
1579:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1580:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1581:   PetscCall(PetscLogGpuTimeEnd());
1582:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1583:   PetscFunctionReturn(PETSC_SUCCESS);
1584: }

1586: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587: {
1588:   const PetscScalar                 *barray;
1589:   PetscScalar                       *xarray;
1590:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1595:   PetscFunctionBegin;
1596:   /* Analyze the matrix and create the transpose ... on the fly */
1597:   if (!loTriFactorT && !upTriFactorT) {
1598:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601:   }

1603:   /* Get the GPU pointers */
1604:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1605:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1607:   PetscCall(PetscLogGpuTimeBegin());
1608:   /* First, solve U */
1609:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1610:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1612:   /* Then, solve L */
1613:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1614:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1616:   /* restore */
1617:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1618:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1619:   PetscCall(PetscLogGpuTimeEnd());
1620:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1621:   PetscFunctionReturn(PETSC_SUCCESS);
1622: }

1624: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625: {
1626:   const PetscScalar                    *barray;
1627:   PetscScalar                          *xarray;
1628:   thrust::device_ptr<const PetscScalar> bGPU;
1629:   thrust::device_ptr<PetscScalar>       xGPU;
1630:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1635:   PetscFunctionBegin;
1636:   /* Get the GPU pointers */
1637:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1638:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639:   xGPU = thrust::device_pointer_cast(xarray);
1640:   bGPU = thrust::device_pointer_cast(barray);

1642:   PetscCall(PetscLogGpuTimeBegin());
1643:   /* First, reorder with the row permutation */
1644:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

1646:   /* Next, solve L */
1647:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1648:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1650:   /* Then, solve U */
1651:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1652:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1654:   /* Last, reorder with the column permutation */
1655:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

1657:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1658:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1659:   PetscCall(PetscLogGpuTimeEnd());
1660:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1661:   PetscFunctionReturn(PETSC_SUCCESS);
1662: }

1664: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665: {
1666:   const PetscScalar                 *barray;
1667:   PetscScalar                       *xarray;
1668:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1673:   PetscFunctionBegin;
1674:   /* Get the GPU pointers */
1675:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1676:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1678:   PetscCall(PetscLogGpuTimeBegin());
1679:   /* First, solve L */
1680:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1681:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1683:   /* Next, solve U */
1684:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1685:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1687:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1688:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1689:   PetscCall(PetscLogGpuTimeEnd());
1690:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1691:   PetscFunctionReturn(PETSC_SUCCESS);
1692: }
1693: #endif

1695: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1696: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697: {
1698:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701:   CsrMatrix                    *Acsr;
1702:   PetscInt                      m, nz;
1703:   PetscBool                     flg;

1705:   PetscFunctionBegin;
1706:   if (PetscDefined(USE_DEBUG)) {
1707:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709:   }

1711:   /* Copy A's values to fact */
1712:   m  = fact->rmap->n;
1713:   nz = aij->nz;
1714:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1718:   /* Factorize fact in place */
1719:   if (m)
1720:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1721:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1722:   if (PetscDefined(USE_DEBUG)) {
1723:     int              numerical_zero;
1724:     cusparseStatus_t status;
1725:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1726:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1727:   }

1729:   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we do it after cusparseXcsrilu02().
1730:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1731:   */
1732:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1734:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1736:   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1737:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1739:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1740:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1741:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1742:   fact->ops->matsolve          = NULL;
1743:   fact->ops->matsolvetranspose = NULL;
1744:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1745:   PetscFunctionReturn(PETSC_SUCCESS);
1746: }
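/*
   A minimal sketch (illustration only) of reaching the device ILU(0) routines above from user code,
   assuming A is a MATSEQAIJCUSPARSE matrix and the setup selects the ILU(0) fast path; the same
   effect is commonly obtained with the options -pc_type ilu -pc_factor_mat_solver_type cusparse:

     Mat F;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F));
     PetscCall(MatILUFactorSymbolic(F, A, rowperm, colperm, &info));
     PetscCall(MatLUFactorNumeric(F, A, &info)); // for ILU factors the numeric stage goes through the routine above
     PetscCall(MatSolve(F, b, x));
*/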

1748: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1749: {
1750:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1751:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1752:   PetscInt                      m, nz;

1754:   PetscFunctionBegin;
1755:   if (PetscDefined(USE_DEBUG)) {
1756:     PetscInt  i;
1757:     PetscBool flg, missing;

1759:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1760:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1761:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1762:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1763:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1764:   }

1766:   /* Free any stale factorization data from a previous setup */
1767:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1769:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1770:      but they will not be used; they are allocated only to ease debugging.
1771:    */
1772:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1774:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1775:   fact->factortype             = MAT_FACTOR_ILU;
1776:   fact->info.factor_mallocs    = 0;
1777:   fact->info.fill_ratio_given  = info->fill;
1778:   fact->info.fill_ratio_needed = 1.0;

1780:   aij->row = NULL;
1781:   aij->col = NULL;

1783:   /* ====================================================================== */
1784:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1785:   /* We'll do in-place factorization on fact                                */
1786:   /* ====================================================================== */
1787:   const int *Ai, *Aj;

1789:   m  = fact->rmap->n;
1790:   nz = aij->nz;

1792:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
1793:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
1794:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
1795:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1796:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1797:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1799:   /* ====================================================================== */
1800:   /* Create descriptors for M, L, U                                         */
1801:   /* ====================================================================== */
1802:   cusparseFillMode_t fillMode;
1803:   cusparseDiagType_t diagType;

1805:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1806:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1807:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1809:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1810:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1811:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1812:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1813:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1814:   */
1815:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1816:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1817:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1818:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1819:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1821:   fillMode = CUSPARSE_FILL_MODE_UPPER;
1822:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1823:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1824:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1825:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1827:   /* ========================================================================= */
1828:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1829:   /* ========================================================================= */
1830:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1831:   if (m)
1832:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1833:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

1835:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1836:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1838:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1839:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1841:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1842:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1844:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1845:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1847:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1848:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1849:      spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
1850:      To save memory, we let factBuffer_M share storage with the bigger of spsvBuffer_L/U.
1851:    */
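  /* An illustrative example with hypothetical sizes (not taken from any real run): if spsvBufferSize_L = 6 MB,
     spsvBufferSize_U = 4 MB and factBufferSize_M = 5 MB, the branch below makes a single 6 MB allocation that
     serves as both factBuffer_M and spsvBuffer_L, plus a separate 4 MB allocation for spsvBuffer_U.
   */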
1852:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1853:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1854:     fs->spsvBuffer_L = fs->factBuffer_M;
1855:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1856:   } else {
1857:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1858:     fs->spsvBuffer_U = fs->factBuffer_M;
1859:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1860:   }

1862:   /* ========================================================================== */
1863:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1864:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1865:   /* ========================================================================== */
1866:   int              structural_zero;
1867:   cusparseStatus_t status;

1869:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1870:   if (m)
1871:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1872:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1873:   if (PetscDefined(USE_DEBUG)) {
1874:     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1875:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1876:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1877:   }

1879:   /* Estimate FLOPs of the numeric factorization */
1880:   {
1881:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1882:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1883:     PetscLogDouble flops = 0.0;

1885:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1886:     Ai    = Aseq->i;
1887:     Adiag = Aseq->diag;
1888:     for (PetscInt i = 0; i < m; i++) {
1889:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1890:         nzRow  = Ai[i + 1] - Ai[i];
1891:         nzLeft = Adiag[i] - Ai[i];
1892:         /* We eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination updates
1893:           the nonzeros to the right of (and including) the eliminated entry, each update costing a multiplication and an addition.
1894:         */
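        /* A sketch of where the closed form below comes from: eliminating the k-th (k = 1..nzLeft) nonzero updates
           (nzRow - k + 1) entries at 2 flops each, so the per-row cost is sum_{k=1}^{nzLeft} 2*(nzRow - k + 1)
           = nzLeft*(2*nzRow - nzLeft + 1).
        */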
1896:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1897:       }
1898:     }
1899:     fs->numericFactFlops = flops;
1900:   }
1901:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1902:   PetscFunctionReturn(PETSC_SUCCESS);
1903: }

1905: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1906: {
1907:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1908:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1909:   const PetscScalar            *barray;
1910:   PetscScalar                  *xarray;

1912:   PetscFunctionBegin;
1913:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1914:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1915:   PetscCall(PetscLogGpuTimeBegin());

1917:   /* Solve L*y = b */
1918:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1919:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1920:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1921:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1923:   /* Solve Lt*x = y */
1924:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1925:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1926:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1928:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1929:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1931:   PetscCall(PetscLogGpuTimeEnd());
1932:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1933:   PetscFunctionReturn(PETSC_SUCCESS);
1934: }

1936: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1937: {
1938:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1939:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1940:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1941:   CsrMatrix                    *Acsr;
1942:   PetscInt                      m, nz;
1943:   PetscBool                     flg;

1945:   PetscFunctionBegin;
1946:   if (PetscDefined(USE_DEBUG)) {
1947:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1948:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1949:   }

1951:   /* Copy A's value to fact */
1952:   m  = fact->rmap->n;
1953:   nz = aij->nz;
1954:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1955:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1956:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1958:   /* Factorize fact inplace */
1959:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1960:      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1961:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1962:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1963:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1964:    */
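  /* A tiny worked example (illustrative only): for A = [4 2; 2 5], csric02() reads the lower triangle {4; 2, 5}
     and overwrites it with L such that L*L' = A, i.e. L = [2 0; 1 2] (L11 = sqrt(4), L21 = 2/2, L22 = sqrt(5 - 1*1)).
     Per the documentation quoted above, the strictly upper triangular values stored in fs->csrVal are left untouched.
   */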
1965:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1966:   if (PetscDefined(USE_DEBUG)) {
1967:     int              numerical_zero;
1968:     cusparseStatus_t status;
1969:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1970:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1971:   }

1973:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1975:   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1976:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1977:   */
1978:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1980:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1981:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1982:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1983:   fact->ops->matsolve          = NULL;
1984:   fact->ops->matsolvetranspose = NULL;
1985:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1986:   PetscFunctionReturn(PETSC_SUCCESS);
1987: }

1989: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1990: {
1991:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1992:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1993:   PetscInt                      m, nz;

1995:   PetscFunctionBegin;
1996:   if (PetscDefined(USE_DEBUG)) {
1997:     PetscInt  i;
1998:     PetscBool flg, missing;

2000:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2001:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2002:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2003:     PetscCall(MatMissingDiagonal(A, &missing, &i));
2004:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2005:   }

2007:   /* Free the old stale stuff */
2008:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

2010:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2011:      but they will not be used. Allocate them just for easy debugging.
2012:    */
2013:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

2015:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2016:   fact->factortype             = MAT_FACTOR_ICC;
2017:   fact->info.factor_mallocs    = 0;
2018:   fact->info.fill_ratio_given  = info->fill;
2019:   fact->info.fill_ratio_needed = 1.0;

2021:   aij->row = NULL;
2022:   aij->col = NULL;

2024:   /* ====================================================================== */
2025:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2026:   /* We'll do in-place factorization on fact                                */
2027:   /* ====================================================================== */
2028:   const int *Ai, *Aj;

2030:   m  = fact->rmap->n;
2031:   nz = aij->nz;

2033:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
2034:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
2035:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2036:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2037:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2038:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

2040:   /* ====================================================================== */
2041:   /* Create mat descriptors for M, L                                        */
2042:   /* ====================================================================== */
2043:   cusparseFillMode_t fillMode;
2044:   cusparseDiagType_t diagType;

2046:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2047:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2048:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

2050:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2051:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2052:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2053:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2054:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2055:   */
2056:   fillMode = CUSPARSE_FILL_MODE_LOWER;
2057:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2058:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2059:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2060:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

2062:   /* ========================================================================= */
2063:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2064:   /* ========================================================================= */
2065:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2066:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

2068:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2069:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

2071:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2072:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

2074:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2075:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

2077:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2078:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

2080:   /* To save device memory, we let the factorization buffer share storage with one of the solve buffers.
2081:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2082:    */
2083:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2084:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2085:     fs->spsvBuffer_L = fs->factBuffer_M;
2086:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2087:   } else {
2088:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2089:     fs->spsvBuffer_Lt = fs->factBuffer_M;
2090:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2091:   }

2093:   /* ========================================================================== */
2094:   /* Perform analysis of ic0 on M                                               */
2095:   /* The lower triangular part of M has the same sparsity pattern as L          */
2096:   /* ========================================================================== */
2097:   int              structural_zero;
2098:   cusparseStatus_t status;

2100:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2101:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2102:   if (PetscDefined(USE_DEBUG)) {
2103:     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2104:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2105:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2106:   }

2108:   /* Estimate FLOPs of the numeric factorization */
2109:   {
2110:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2111:     PetscInt      *Ai, nzRow, nzLeft;
2112:     PetscLogDouble flops = 0.0;

2114:     Ai = Aseq->i;
2115:     for (PetscInt i = 0; i < m; i++) {
2116:       nzRow = Ai[i + 1] - Ai[i];
2117:       if (nzRow > 1) {
2118:         /* We eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination updates
2119:           the nonzeros to the right of (and including) the eliminated entry, each update costing a multiplication and an addition.
2120:         */
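        /* Here we do not locate the diagonal exactly; instead we approximate: with the full symmetric pattern stored,
           roughly half of the (nzRow - 1) off-diagonal entries of a row lie to the left of the diagonal, hence
           nzLeft = (nzRow - 1) / 2. The per-row cost then follows the same closed form nzLeft*(2*nzRow - nzLeft + 1)
           as in the ILU0 estimate above.
        */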
2121:         nzLeft = (nzRow - 1) / 2;
2122:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2123:       }
2124:     }
2125:     fs->numericFactFlops = flops;
2126:   }
2127:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2128:   PetscFunctionReturn(PETSC_SUCCESS);
2129: }
2130: #endif

2132: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2133: {
2134:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2135:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

2137:   PetscFunctionBegin;
2138:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2139:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2140:   B->offloadmask = PETSC_OFFLOAD_CPU;

2142:   if (!cusparsestruct->use_cpu_solve) {
2143: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2144:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2145:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2146: #else
2147:     /* determine which version of MatSolve needs to be used. */
2148:     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2149:     IS          isrow = b->row, iscol = b->col;
2150:     PetscBool   row_identity, col_identity;

2152:     PetscCall(ISIdentity(isrow, &row_identity));
2153:     PetscCall(ISIdentity(iscol, &col_identity));
2154:     if (row_identity && col_identity) {
2155:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2156:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2157:     } else {
2158:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2159:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2160:     }
2161: #endif
2162:   }
2163:   B->ops->matsolve          = NULL;
2164:   B->ops->matsolvetranspose = NULL;

2166:   /* get the triangular factors */
2167:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2168:   PetscFunctionReturn(PETSC_SUCCESS);
2169: }

2171: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2172: {
2173:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

2175:   PetscFunctionBegin;
2176:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2177:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2178:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2179:   PetscFunctionReturn(PETSC_SUCCESS);
2180: }

2182: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2183: {
2184:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2186:   PetscFunctionBegin;
2187: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2188:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2189:   if (cusparseTriFactors->factorizeOnDevice) {
2190:     PetscCall(ISIdentity(isrow, &row_identity));
2191:     PetscCall(ISIdentity(iscol, &col_identity));
2192:   }
2193:   if (!info->levels && row_identity && col_identity) {
2194:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2195:   } else
2196: #endif
2197:   {
2198:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2199:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2200:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2201:   }
2202:   PetscFunctionReturn(PETSC_SUCCESS);
2203: }

2205: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2206: {
2207:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2209:   PetscFunctionBegin;
2210: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2211:   PetscBool perm_identity = PETSC_FALSE;
2212:   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
2213:   if (!info->levels && perm_identity) {
2214:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2215:   } else
2216: #endif
2217:   {
2218:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2219:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2220:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2221:   }
2222:   PetscFunctionReturn(PETSC_SUCCESS);
2223: }

2225: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2226: {
2227:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2229:   PetscFunctionBegin;
2230:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2231:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2232:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2233:   PetscFunctionReturn(PETSC_SUCCESS);
2234: }

2236: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2237: {
2238:   PetscFunctionBegin;
2239:   *type = MATSOLVERCUSPARSE;
2240:   PetscFunctionReturn(PETSC_SUCCESS);
2241: }

2243: /*MC
2244:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2245:   of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2246:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2247:   performance in the triangular solves. Full LU and Cholesky decompositions can also be solved through the
2248:   cuSPARSE triangular solve algorithm, but the performance can be quite poor, so these
2249:   algorithms are not recommended. This class does NOT support direct solver operations.
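
  For example, one way to select this solver from the options database when using an ILU preconditioner is
  (an illustrative sketch; the surrounding `KSP`/`PC` setup is application dependent):
.vb
    -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
.ve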

2251:   Level: beginner

2253: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2254:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2255: M*/

2257: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2258: {
2259:   PetscInt  n = A->rmap->n;
2260:   PetscBool factOnDevice, factOnHost;
2261:   char     *prefix;
2262:   char      factPlace[32] = "device"; /* the default */

2264:   PetscFunctionBegin;
2265:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2266:   PetscCall(MatSetSizes(*B, n, n, n, n));
2267:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2268:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

2270:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
2271:   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
2272:   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
2273:   PetscOptionsEnd();
2274:   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
2275:   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
2276:   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
2277:   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
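  /* For example, running with -mat_factor_bind_factorization host keeps the factorization itself on the CPU
     (the device ILU(0)/ICC(0) paths above are then not taken), while the triangular solves may still run on
     the GPU; "device", the default, enables the device factorization paths when they are applicable.
   */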

2279:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2280:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2281:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2282:     if (!A->boundtocpu) {
2283:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2284:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2285:     } else {
2286:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2287:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2288:     }
2289:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2290:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2291:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2292:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2293:     if (!A->boundtocpu) {
2294:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2295:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2296:     } else {
2297:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2298:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2299:     }
2300:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2301:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2302:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

2304:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2305:   (*B)->canuseordering = PETSC_TRUE;
2306:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2307:   PetscFunctionReturn(PETSC_SUCCESS);
2308: }

2310: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2311: {
2312:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2313:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2314: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2315:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2316: #endif

2318:   PetscFunctionBegin;
2319:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2320:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2321:     if (A->factortype == MAT_FACTOR_NONE) {
2322:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2323:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2324:     }
2325: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326:     else if (fs->csrVal) {
2327:       /* We have a factorized matrix on device and are able to copy it to host */
2328:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329:     }
2330: #endif
2331:     else
2332:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2333:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2334:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2335:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2336:   }
2337:   PetscFunctionReturn(PETSC_SUCCESS);
2338: }

2340: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2341: {
2342:   PetscFunctionBegin;
2343:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2344:   *array = ((Mat_SeqAIJ *)A->data)->a;
2345:   PetscFunctionReturn(PETSC_SUCCESS);
2346: }

2348: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2349: {
2350:   PetscFunctionBegin;
2351:   A->offloadmask = PETSC_OFFLOAD_CPU;
2352:   *array         = NULL;
2353:   PetscFunctionReturn(PETSC_SUCCESS);
2354: }

2356: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2357: {
2358:   PetscFunctionBegin;
2359:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2360:   *array = ((Mat_SeqAIJ *)A->data)->a;
2361:   PetscFunctionReturn(PETSC_SUCCESS);
2362: }

2364: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2365: {
2366:   PetscFunctionBegin;
2367:   *array = NULL;
2368:   PetscFunctionReturn(PETSC_SUCCESS);
2369: }

2371: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2372: {
2373:   PetscFunctionBegin;
2374:   *array = ((Mat_SeqAIJ *)A->data)->a;
2375:   PetscFunctionReturn(PETSC_SUCCESS);
2376: }

2378: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2379: {
2380:   PetscFunctionBegin;
2381:   A->offloadmask = PETSC_OFFLOAD_CPU;
2382:   *array         = NULL;
2383:   PetscFunctionReturn(PETSC_SUCCESS);
2384: }

2386: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2387: {
2388:   Mat_SeqAIJCUSPARSE *cusp;
2389:   CsrMatrix          *matrix;

2391:   PetscFunctionBegin;
2392:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2393:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2394:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2395:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2396:   matrix = (CsrMatrix *)cusp->mat->mat;

2398:   if (i) {
2399: #if !defined(PETSC_USE_64BIT_INDICES)
2400:     *i = matrix->row_offsets->data().get();
2401: #else
2402:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2403: #endif
2404:   }
2405:   if (j) {
2406: #if !defined(PETSC_USE_64BIT_INDICES)
2407:     *j = matrix->column_indices->data().get();
2408: #else
2409:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2410: #endif
2411:   }
2412:   if (a) *a = matrix->values->data().get();
2413:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2414:   PetscFunctionReturn(PETSC_SUCCESS);
2415: }

2417: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2418: {
2419:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2420:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2421:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2422:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2423:   cusparseStatus_t              stat;
2424:   PetscBool                     both = PETSC_TRUE;

2426:   PetscFunctionBegin;
2427:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2428:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2429:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2430:       CsrMatrix *matrix;
2431:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2433:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2434:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2435:       matrix->values->assign(a->a, a->a + a->nz);
2436:       PetscCallCUDA(WaitForCUDA());
2437:       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2438:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2439:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2440:     } else {
2441:       PetscInt nnz;
2442:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2443:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2444:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2445:       delete cusparsestruct->workVector;
2446:       delete cusparsestruct->rowoffsets_gpu;
2447:       cusparsestruct->workVector     = NULL;
2448:       cusparsestruct->rowoffsets_gpu = NULL;
2449:       try {
2450:         if (a->compressedrow.use) {
2451:           m    = a->compressedrow.nrows;
2452:           ii   = a->compressedrow.i;
2453:           ridx = a->compressedrow.rindex;
2454:         } else {
2455:           m    = A->rmap->n;
2456:           ii   = a->i;
2457:           ridx = NULL;
2458:         }
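        /* A small illustrative example: for a 5-row matrix whose only nonzeros are in rows 1 and 3, the compressed-row
           storage has nrows = 2, rindex = {1, 3}, and ii is the row-offset array for just those two rows, so the
           SpMV kernels can skip the empty rows entirely.
        */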
2459:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2460:         if (!a->a) {
2461:           nnz  = ii[m];
2462:           both = PETSC_FALSE;
2463:         } else nnz = a->nz;
2464:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2466:         /* create cusparse matrix */
2467:         cusparsestruct->nrows = m;
2468:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2469:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2470:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2471:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

2473:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2474:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2475:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2476:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2477:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

2481:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2482:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2483:           /* set the matrix */
2484:           CsrMatrix *mat   = new CsrMatrix;
2485:           mat->num_rows    = m;
2486:           mat->num_cols    = A->cmap->n;
2487:           mat->num_entries = nnz;
2488:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2489:           mat->row_offsets->assign(ii, ii + m + 1);

2491:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2492:           mat->column_indices->assign(a->j, a->j + nnz);

2494:           mat->values = new THRUSTARRAY(nnz);
2495:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2497:           /* assign the pointer */
2498:           matstruct->mat = mat;
2499: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503:             PetscCallCUSPARSE(stat);
2504:           }
2505: #endif
2506:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509: #else
2510:           CsrMatrix *mat   = new CsrMatrix;
2511:           mat->num_rows    = m;
2512:           mat->num_cols    = A->cmap->n;
2513:           mat->num_entries = nnz;
2514:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2515:           mat->row_offsets->assign(ii, ii + m + 1);

2517:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2518:           mat->column_indices->assign(a->j, a->j + nnz);

2520:           mat->values = new THRUSTARRAY(nnz);
2521:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2523:           cusparseHybMat_t hybMat;
2524:           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526:           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527:           PetscCallCUSPARSE(stat);
2528:           /* assign the pointer */
2529:           matstruct->mat = hybMat;

2531:           if (mat) {
2532:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2533:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535:             delete (CsrMatrix *)mat;
2536:           }
2537: #endif
2538:         }

2540:         /* assign the compressed row indices */
2541:         if (a->compressedrow.use) {
2542:           cusparsestruct->workVector = new THRUSTARRAY(m);
2543:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2544:           matstruct->cprowIndices->assign(ridx, ridx + m);
2545:           tmp = m;
2546:         } else {
2547:           cusparsestruct->workVector = NULL;
2548:           matstruct->cprowIndices    = NULL;
2549:           tmp                        = 0;
2550:         }
2551:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2553:         /* assign the pointer */
2554:         cusparsestruct->mat = matstruct;
2555:       } catch (char *ex) {
2556:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557:       }
2558:       PetscCallCUDA(WaitForCUDA());
2559:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2560:       cusparsestruct->nonzerostate = A->nonzerostate;
2561:     }
2562:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563:   }
2564:   PetscFunctionReturn(PETSC_SUCCESS);
2565: }

2567: struct VecCUDAPlusEquals {
2568:   template <typename Tuple>
2569:   __host__ __device__ void operator()(Tuple t)
2570:   {
2571:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572:   }
2573: };

2575: struct VecCUDAEquals {
2576:   template <typename Tuple>
2577:   __host__ __device__ void operator()(Tuple t)
2578:   {
2579:     thrust::get<1>(t) = thrust::get<0>(t);
2580:   }
2581: };

2583: struct VecCUDAEqualsReverse {
2584:   template <typename Tuple>
2585:   __host__ __device__ void operator()(Tuple t)
2586:   {
2587:     thrust::get<0>(t) = thrust::get<1>(t);
2588:   }
2589: };
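/* A minimal usage sketch (not part of this file's logic) of how such tuple functors are applied with Thrust,
   assuming raw device pointers src and dst of length n:

     auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(src), thrust::device_pointer_cast(dst)));
     thrust::for_each(first, first + n, VecCUDAPlusEquals()); // performs dst[i] += src[i] for every i
*/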

2591: struct MatMatCusparse {
2592:   PetscBool      cisdense;
2593:   PetscScalar   *Bt;
2594:   Mat            X;
2595:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596:   PetscLogDouble flops;
2597:   CsrMatrix     *Bcsr;

2599: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600:   cusparseSpMatDescr_t matSpBDescr;
2601:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2602:   cusparseDnMatDescr_t matBDescr;
2603:   cusparseDnMatDescr_t matCDescr;
2604:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2605:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606:   void *dBuffer4;
2607:   void *dBuffer5;
2608:   #endif
2609:   size_t                mmBufferSize;
2610:   void                 *mmBuffer;
2611:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612:   cusparseSpGEMMDescr_t spgemmDesc;
2613: #endif
2614: };

2616: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617: {
2618:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2620:   PetscFunctionBegin;
2621:   PetscCallCUDA(cudaFree(mmdata->Bt));
2622:   delete mmdata->Bcsr;
2623: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629:   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630:   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631:   #endif
2632:   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633:   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634: #endif
2635:   PetscCall(MatDestroy(&mmdata->X));
2636:   PetscCall(PetscFree(data));
2637:   PetscFunctionReturn(PETSC_SUCCESS);
2638: }

2640: #include <../src/mat/impls/dense/seq/dense.h>

2642: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643: {
2644:   Mat_Product                  *product = C->product;
2645:   Mat                           A, B;
2646:   PetscInt                      m, n, blda, clda;
2647:   PetscBool                     flg, biscuda;
2648:   Mat_SeqAIJCUSPARSE           *cusp;
2649:   cusparseStatus_t              stat;
2650:   cusparseOperation_t           opA;
2651:   const PetscScalar            *barray;
2652:   PetscScalar                  *carray;
2653:   MatMatCusparse               *mmdata;
2654:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655:   CsrMatrix                    *csrmat;

2657:   PetscFunctionBegin;
2658:   MatCheckProduct(C, 1);
2659:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660:   mmdata = (MatMatCusparse *)product->data;
2661:   A      = product->A;
2662:   B      = product->B;
2663:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670:   switch (product->type) {
2671:   case MATPRODUCT_AB:
2672:   case MATPRODUCT_PtAP:
2673:     mat = cusp->mat;
2674:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675:     m   = A->rmap->n;
2676:     n   = B->cmap->n;
2677:     break;
2678:   case MATPRODUCT_AtB:
2679:     if (!A->form_explicit_transpose) {
2680:       mat = cusp->mat;
2681:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682:     } else {
2683:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684:       mat = cusp->matTranspose;
2685:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686:     }
2687:     m = A->cmap->n;
2688:     n = B->cmap->n;
2689:     break;
2690:   case MATPRODUCT_ABt:
2691:   case MATPRODUCT_RARt:
2692:     mat = cusp->mat;
2693:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694:     m   = A->rmap->n;
2695:     n   = B->rmap->n;
2696:     break;
2697:   default:
2698:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699:   }
2700:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701:   csrmat = (CsrMatrix *)mat->mat;
2702:   /* if the user passed a CPU matrix, copy the data to the GPU */
2703:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

2707:   PetscCall(MatDenseGetLDA(B, &blda));
2708:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711:   } else {
2712:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713:     PetscCall(MatDenseGetLDA(C, &clda));
2714:   }

2716:   PetscCall(PetscLogGpuTimeBegin());
2717: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2720:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2721:     size_t mmBufferSize;
2722:     if (mmdata->initialized && mmdata->Blda != blda) {
2723:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2724:       mmdata->matBDescr = NULL;
2725:     }
2726:     if (!mmdata->matBDescr) {
2727:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2728:       mmdata->Blda = blda;
2729:     }

2731:     if (mmdata->initialized && mmdata->Clda != clda) {
2732:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2733:       mmdata->matCDescr = NULL;
2734:     }
2735:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2736:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737:       mmdata->Clda = clda;
2738:     }

2740:     if (!mat->matDescr) {
2741:       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2742:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2743:       PetscCallCUSPARSE(stat);
2744:     }
2745:     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2746:     PetscCallCUSPARSE(stat);
2747:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2748:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2749:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2750:       mmdata->mmBufferSize = mmBufferSize;
2751:     }
2752:     mmdata->initialized = PETSC_TRUE;
2753:   } else {
2754:     /* to be safe, always update pointers of the mats */
2755:     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2756:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2757:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2758:   }

2760:   /* do cusparseSpMM, which supports transpose on B */
2761:   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2762:   PetscCallCUSPARSE(stat);
2763: #else
2764:   PetscInt k;
2765:   /* cusparseXcsrmm does not support transpose on B */
2766:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2767:     cublasHandle_t cublasv2handle;
2768:     cublasStatus_t cerr;

2770:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2771:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2772:     PetscCallCUBLAS(cerr);
2773:     blda = B->cmap->n;
2774:     k    = B->cmap->n;
2775:   } else {
2776:     k = B->rmap->n;
2777:   }

2779:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2780:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2781:   PetscCallCUSPARSE(stat);
2782: #endif
2783:   PetscCall(PetscLogGpuTimeEnd());
2784:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2785:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2786:   if (product->type == MATPRODUCT_RARt) {
2787:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2788:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2789:   } else if (product->type == MATPRODUCT_PtAP) {
2790:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2791:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2792:   } else {
2793:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2794:   }
2795:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2796:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2797:   PetscFunctionReturn(PETSC_SUCCESS);
2798: }

2800: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2801: {
2802:   Mat_Product        *product = C->product;
2803:   Mat                 A, B;
2804:   PetscInt            m, n;
2805:   PetscBool           cisdense, flg;
2806:   MatMatCusparse     *mmdata;
2807:   Mat_SeqAIJCUSPARSE *cusp;

2809:   PetscFunctionBegin;
2810:   MatCheckProduct(C, 1);
2811:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2812:   A = product->A;
2813:   B = product->B;
2814:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2815:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2816:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2817:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2818:   switch (product->type) {
2819:   case MATPRODUCT_AB:
2820:     m = A->rmap->n;
2821:     n = B->cmap->n;
2822:     break;
2823:   case MATPRODUCT_AtB:
2824:     m = A->cmap->n;
2825:     n = B->cmap->n;
2826:     break;
2827:   case MATPRODUCT_ABt:
2828:     m = A->rmap->n;
2829:     n = B->rmap->n;
2830:     break;
2831:   case MATPRODUCT_PtAP:
2832:     m = B->cmap->n;
2833:     n = B->cmap->n;
2834:     break;
2835:   case MATPRODUCT_RARt:
2836:     m = B->rmap->n;
2837:     n = B->rmap->n;
2838:     break;
2839:   default:
2840:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2841:   }
2842:   PetscCall(MatSetSizes(C, m, n, m, n));
2843:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2844:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2845:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

2847:   /* product data */
2848:   PetscCall(PetscNew(&mmdata));
2849:   mmdata->cisdense = cisdense;
2850: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2851:   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2852:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2853: #endif
2854:   /* for these products we need intermediate storage */
2855:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2856:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2857:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2858:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2859:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2860:     } else {
2861:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2862:     }
2863:   }
2864:   C->product->data    = mmdata;
2865:   C->product->destroy = MatDestroy_MatMatCusparse;

2867:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2868:   PetscFunctionReturn(PETSC_SUCCESS);
2869: }

2871: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2872: {
2873:   Mat_Product                  *product = C->product;
2874:   Mat                           A, B;
2875:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2876:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2877:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2878:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2879:   PetscBool                     flg;
2880:   cusparseStatus_t              stat;
2881:   MatProductType                ptype;
2882:   MatMatCusparse               *mmdata;
2883: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2884:   cusparseSpMatDescr_t BmatSpDescr;
2885: #endif
2886:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2888:   PetscFunctionBegin;
2889:   MatCheckProduct(C, 1);
2890:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2891:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2892:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2893:   mmdata = (MatMatCusparse *)C->product->data;
2894:   A      = product->A;
2895:   B      = product->B;
2896:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2897:     mmdata->reusesym = PETSC_FALSE;
2898:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2899:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2900:     Cmat = Ccusp->mat;
2901:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2902:     Ccsr = (CsrMatrix *)Cmat->mat;
2903:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2904:     goto finalize;
2905:   }
2906:   if (!c->nz) goto finalize;
2907:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2908:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2909:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2910:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2911:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2912:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2913:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2914:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2915:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2916:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2917:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2918:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2919:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2920:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

2922:   ptype = product->type;
2923:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2924:     ptype = MATPRODUCT_AB;
2925:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2926:   }
2927:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2928:     ptype = MATPRODUCT_AB;
2929:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2930:   }
2931:   switch (ptype) {
2932:   case MATPRODUCT_AB:
2933:     Amat = Acusp->mat;
2934:     Bmat = Bcusp->mat;
2935:     break;
2936:   case MATPRODUCT_AtB:
2937:     Amat = Acusp->matTranspose;
2938:     Bmat = Bcusp->mat;
2939:     break;
2940:   case MATPRODUCT_ABt:
2941:     Amat = Acusp->mat;
2942:     Bmat = Bcusp->matTranspose;
2943:     break;
2944:   default:
2945:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2946:   }
2947:   Cmat = Ccusp->mat;
2948:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2949:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2950:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2951:   Acsr = (CsrMatrix *)Amat->mat;
2952:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2953:   Ccsr = (CsrMatrix *)Cmat->mat;
2954:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2955:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2956:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957:   PetscCall(PetscLogGpuTimeBegin());
2958: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2959:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2960:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2961:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2962:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2963:   PetscCallCUSPARSE(stat);
2964:   #else
2965:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2966:   PetscCallCUSPARSE(stat);
2967:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2968:   PetscCallCUSPARSE(stat);
2969:   #endif
2970: #else
2971:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2972:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2973:   PetscCallCUSPARSE(stat);
2974: #endif
2975:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2976:   PetscCallCUDA(WaitForCUDA());
2977:   PetscCall(PetscLogGpuTimeEnd());
2978:   C->offloadmask = PETSC_OFFLOAD_GPU;
2979: finalize:
2980:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2981:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2982:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2983:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2984:   c->reallocs = 0;
2985:   C->info.mallocs += 0;
2986:   C->info.nz_unneeded = 0;
2987:   C->assembled = C->was_assembled = PETSC_TRUE;
2988:   C->num_ass++;
2989:   PetscFunctionReturn(PETSC_SUCCESS);
2990: }

2992: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2993: {
2994:   Mat_Product                  *product = C->product;
2995:   Mat                           A, B;
2996:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2997:   Mat_SeqAIJ                   *a, *b, *c;
2998:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2999:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3000:   PetscInt                      i, j, m, n, k;
3001:   PetscBool                     flg;
3002:   cusparseStatus_t              stat;
3003:   MatProductType                ptype;
3004:   MatMatCusparse               *mmdata;
3005:   PetscLogDouble                flops;
3006:   PetscBool                     biscompressed, ciscompressed;
3007: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3008:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3009:   cusparseSpMatDescr_t BmatSpDescr;
3010: #else
3011:   int cnz;
3012: #endif
3013:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

3015:   PetscFunctionBegin;
3016:   MatCheckProduct(C, 1);
3017:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3018:   A = product->A;
3019:   B = product->B;
3020:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3021:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3022:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3023:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3024:   a = (Mat_SeqAIJ *)A->data;
3025:   b = (Mat_SeqAIJ *)B->data;
3026:   /* product data */
3027:   PetscCall(PetscNew(&mmdata));
3028:   C->product->data    = mmdata;
3029:   C->product->destroy = MatDestroy_MatMatCusparse;

3031:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3032:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3033:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3034:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3035:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

3038:   ptype = product->type;
3039:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3040:     ptype                                          = MATPRODUCT_AB;
3041:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3042:   }
3043:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3044:     ptype                                          = MATPRODUCT_AB;
3045:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3046:   }
3047:   biscompressed = PETSC_FALSE;
3048:   ciscompressed = PETSC_FALSE;
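  /* A row of C = A*B (or A*B^T) is empty whenever the corresponding row of A is empty, so C inherits A's
     compressed-row pattern; if B is stored in compressed row format we will need its full row offsets for
     cuSPARSE (handled below). */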
3049:   switch (ptype) {
3050:   case MATPRODUCT_AB:
3051:     m    = A->rmap->n;
3052:     n    = B->cmap->n;
3053:     k    = A->cmap->n;
3054:     Amat = Acusp->mat;
3055:     Bmat = Bcusp->mat;
3056:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3057:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3058:     break;
3059:   case MATPRODUCT_AtB:
3060:     m = A->cmap->n;
3061:     n = B->cmap->n;
3062:     k = A->rmap->n;
3063:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3064:     Amat = Acusp->matTranspose;
3065:     Bmat = Bcusp->mat;
3066:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3067:     break;
3068:   case MATPRODUCT_ABt:
3069:     m = A->rmap->n;
3070:     n = B->rmap->n;
3071:     k = A->cmap->n;
3072:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3073:     Amat = Acusp->mat;
3074:     Bmat = Bcusp->matTranspose;
3075:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3076:     break;
3077:   default:
3078:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3079:   }

3081:   /* create cusparse matrix */
3082:   PetscCall(MatSetSizes(C, m, n, m, n));
3083:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3084:   c     = (Mat_SeqAIJ *)C->data;
3085:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3086:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3087:   Ccsr  = new CsrMatrix;

3089:   c->compressedrow.use = ciscompressed;
3090:   if (c->compressedrow.use) { /* if a is stored in compressed row format, then c will be in compressed row format too */
3091:     c->compressedrow.nrows = a->compressedrow.nrows;
3092:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3093:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3094:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3095:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3096:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3097:   } else {
3098:     c->compressedrow.nrows  = 0;
3099:     c->compressedrow.i      = NULL;
3100:     c->compressedrow.rindex = NULL;
3101:     Ccusp->workVector       = NULL;
3102:     Cmat->cprowIndices      = NULL;
3103:   }
3104:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3105:   Ccusp->mat        = Cmat;
3106:   Ccusp->mat->mat   = Ccsr;
3107:   Ccsr->num_rows    = Ccusp->nrows;
3108:   Ccsr->num_cols    = n;
3109:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3110:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3111:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3112:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3113:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
3114:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
3115:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3116:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3117:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3119:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in various calls when matrices have zero rows/columns! */
3120:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3121:     c->nz                = 0;
3122:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3123:     Ccsr->values         = new THRUSTARRAY(c->nz);
3124:     goto finalizesym;
3125:   }

3127:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3128:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3129:   Acsr = (CsrMatrix *)Amat->mat;
3130:   if (!biscompressed) {
3131:     Bcsr = (CsrMatrix *)Bmat->mat;
3132: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3133:     BmatSpDescr = Bmat->matDescr;
3134: #endif
3135:   } else { /* we need to use row offsets for the full matrix */
3136:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3137:     Bcsr                 = new CsrMatrix;
3138:     Bcsr->num_rows       = B->rmap->n;
3139:     Bcsr->num_cols       = cBcsr->num_cols;
3140:     Bcsr->num_entries    = cBcsr->num_entries;
3141:     Bcsr->column_indices = cBcsr->column_indices;
3142:     Bcsr->values         = cBcsr->values;
3143:     if (!Bcusp->rowoffsets_gpu) {
3144:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3145:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3146:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3147:     }
3148:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3149:     mmdata->Bcsr      = Bcsr;
3150: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3151:     if (Bcsr->num_rows && Bcsr->num_cols) {
3152:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3153:       PetscCallCUSPARSE(stat);
3154:     }
3155:     BmatSpDescr = mmdata->matSpBDescr;
3156: #endif
3157:   }
3158:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3159:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3160:   /* precompute flops count */
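  /* For AB, each nonzero A(i,k) multiplies row k of B, contributing 2*nnz(B(k,:)) flops (a multiply and an add
     per entry). For AtB, row i of A pairs with row i of B in the outer-product formulation, contributing
     2*nnz(A(i,:))*nnz(B(i,:)) flops. */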
3161:   if (ptype == MATPRODUCT_AB) {
3162:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3163:       const PetscInt st = a->i[i];
3164:       const PetscInt en = a->i[i + 1];
3165:       for (j = st; j < en; j++) {
3166:         const PetscInt brow = a->j[j];
3167:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3168:       }
3169:     }
3170:   } else if (ptype == MATPRODUCT_AtB) {
3171:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3172:       const PetscInt anzi = a->i[i + 1] - a->i[i];
3173:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3174:       flops += (2. * anzi) * bnzi;
3175:     }
3176:   } else { /* TODO */
3177:     flops = 0.;
3178:   }

3180:   mmdata->flops = flops;
3181:   PetscCall(PetscLogGpuTimeBegin());

3183: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3185:   // cuda-12.2 requires non-null csrRowOffsets
3186:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3187:   PetscCallCUSPARSE(stat);
3188:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3189:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3190:   {
3191:     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3192:        We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3193:     */
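    /* The SpGEMMreuse flow below: (1) workEstimation to inspect A and B, (2) nnz to compute C's sparsity pattern,
       (3) allocate C's column indices and values and update its descriptor, (4) copy to finalize the pattern,
       (5) compute the numerical values (only this last step needs to be repeated when the values change). */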
3194:     void *dBuffer1 = NULL;
3195:     void *dBuffer2 = NULL;
3196:     void *dBuffer3 = NULL;
3197:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3198:     size_t bufferSize1 = 0;
3199:     size_t bufferSize2 = 0;
3200:     size_t bufferSize3 = 0;
3201:     size_t bufferSize4 = 0;
3202:     size_t bufferSize5 = 0;

3204:     /* query bufferSize1, the number of bytes of external memory needed */
3205:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3206:     PetscCallCUSPARSE(stat);
3207:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3208:     /* inspect the matrices A and B to understand the memory requirement for the next step */
3209:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3210:     PetscCallCUSPARSE(stat);

3212:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3213:     PetscCallCUSPARSE(stat);
3214:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3215:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3216:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3217:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3218:     PetscCallCUSPARSE(stat);
3219:     PetscCallCUDA(cudaFree(dBuffer1));
3220:     PetscCallCUDA(cudaFree(dBuffer2));

3222:     /* get matrix C non-zero entries C_nnz1 */
3223:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3224:     c->nz = (PetscInt)C_nnz1;
3225:     /* allocate matrix C */
3226:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3227:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228:     Ccsr->values = new THRUSTARRAY(c->nz);
3229:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3230:     /* update matC with the new pointers */
3231:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3232:     PetscCallCUSPARSE(stat);

3234:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3235:     PetscCallCUSPARSE(stat);
3236:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3237:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3238:     PetscCallCUSPARSE(stat);
3239:     PetscCallCUDA(cudaFree(dBuffer3));
3240:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3241:     PetscCallCUSPARSE(stat);
3242:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3243:   }
3244:   #else
3245:   size_t bufSize2;
3246:   /* query bufSize2, the number of bytes of external memory needed */
3247:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3248:   PetscCallCUSPARSE(stat);
3249:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3250:   /* inspect the matrices A and B to understand the memory requirement for the next step */
3251:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3252:   PetscCallCUSPARSE(stat);
3253:   /* query the buffer size needed for the compute step */
3254:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3255:   PetscCallCUSPARSE(stat);
3256:   /* Neither the CUSPARSE documentation nor the API is clear on this point:
3257:      we need both buffers to perform the operations properly!
3258:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3259:      it only appears in the workEstimation calls, yet it seems to be needed in compute, so its address is
3260:      probably stored in the descriptor. What a messy API... */
3261:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3262:   /* compute the intermediate product of A * B */
3263:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3264:   PetscCallCUSPARSE(stat);
3265:   /* get matrix C non-zero entries C_nnz1 */
3266:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3267:   c->nz = (PetscInt)C_nnz1;
3268:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3269:                       mmdata->mmBufferSize / 1024));
3270:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3271:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3272:   Ccsr->values = new THRUSTARRAY(c->nz);
3273:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3274:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3275:   PetscCallCUSPARSE(stat);
3276:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3277:   PetscCallCUSPARSE(stat);
3278:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3279: #else
3280:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3281:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3282:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3283:   PetscCallCUSPARSE(stat);
3284:   c->nz                = cnz;
3285:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3286:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287:   Ccsr->values = new THRUSTARRAY(c->nz);
3288:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

3290:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3291:   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
3292:      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for the values, but it seems quite buggy when
3293:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3294:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3295:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3296:   PetscCallCUSPARSE(stat);
3297: #endif
3298:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3299:   PetscCall(PetscLogGpuTimeEnd());
3300: finalizesym:
3301:   c->singlemalloc = PETSC_FALSE;
3302:   c->free_a       = PETSC_TRUE;
3303:   c->free_ij      = PETSC_TRUE;
3304:   PetscCall(PetscMalloc1(m + 1, &c->i));
3305:   PetscCall(PetscMalloc1(c->nz, &c->j));
3306:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
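    /* THRUSTINTARRAY holds PetscInt (64-bit here), so assigning the 32-bit device arrays into ii/jj performs the
       widening conversion on the GPU; the widened arrays are then memcpy'd straight into the host CSR arrays. */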
3307:     PetscInt      *d_i = c->i;
3308:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3309:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3310:     ii = *Ccsr->row_offsets;
3311:     jj = *Ccsr->column_indices;
3312:     if (ciscompressed) d_i = c->compressedrow.i;
3313:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3314:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3315:   } else {
3316:     PetscInt *d_i = c->i;
3317:     if (ciscompressed) d_i = c->compressedrow.i;
3318:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3319:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3320:   }
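  /* In compressed row storage only rows with nonzeros are kept: c->compressedrow.i holds the row offsets of those
     rows and c->compressedrow.rindex maps them back to full row numbers. The loop below rebuilds the full-length
     c->i by repeating each offset across the skipped (empty) rows. */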
3321:   if (ciscompressed) { /* need to expand host row offsets */
3322:     PetscInt r = 0;
3323:     c->i[0]    = 0;
3324:     for (k = 0; k < c->compressedrow.nrows; k++) {
3325:       const PetscInt next = c->compressedrow.rindex[k];
3326:       const PetscInt old  = c->compressedrow.i[k];
3327:       for (; r < next; r++) c->i[r + 1] = old;
3328:     }
3329:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3330:   }
3331:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3332:   PetscCall(PetscMalloc1(m, &c->ilen));
3333:   PetscCall(PetscMalloc1(m, &c->imax));
3334:   c->maxnz         = c->nz;
3335:   c->nonzerorowcnt = 0;
3336:   c->rmax          = 0;
3337:   for (k = 0; k < m; k++) {
3338:     const PetscInt nn = c->i[k + 1] - c->i[k];
3339:     c->ilen[k] = c->imax[k] = nn;
3340:     c->nonzerorowcnt += (PetscInt) !!nn;
3341:     c->rmax = PetscMax(c->rmax, nn);
3342:   }
3343:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3344:   PetscCall(PetscMalloc1(c->nz, &c->a));
3345:   Ccsr->num_entries = c->nz;

3347:   C->nonzerostate++;
3348:   PetscCall(PetscLayoutSetUp(C->rmap));
3349:   PetscCall(PetscLayoutSetUp(C->cmap));
3350:   Ccusp->nonzerostate = C->nonzerostate;
3351:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3352:   C->preallocated     = PETSC_TRUE;
3353:   C->assembled        = PETSC_FALSE;
3354:   C->was_assembled    = PETSC_FALSE;
3355:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3356:     mmdata->reusesym = PETSC_TRUE;
3357:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3358:   }
3359:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3360:   PetscFunctionReturn(PETSC_SUCCESS);
3361: }

3363: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3365: /* handles sparse or dense B */
3366: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3367: {
3368:   Mat_Product *product = mat->product;
3369:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3371:   PetscFunctionBegin;
3372:   MatCheckProduct(mat, 1);
3373:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3374:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3375:   if (product->type == MATPRODUCT_ABC) {
3376:     Ciscusp = PETSC_FALSE;
3377:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3378:   }
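  /* The runtime options checked below (-matmatmult_backend_cpu, -mat_product_algorithm_backend_cpu, etc.) let the
     user force the CPU implementation of a product even when both operands are GPU matrices; in that case the cusp
     flags are cleared and dispatch falls back to MatProductSetFromOptions_SeqAIJ at the end of this routine. */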
3379:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3380:     PetscBool usecpu = PETSC_FALSE;
3381:     switch (product->type) {
3382:     case MATPRODUCT_AB:
3383:       if (product->api_user) {
3384:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3385:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3386:         PetscOptionsEnd();
3387:       } else {
3388:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3389:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3390:         PetscOptionsEnd();
3391:       }
3392:       break;
3393:     case MATPRODUCT_AtB:
3394:       if (product->api_user) {
3395:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3396:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3397:         PetscOptionsEnd();
3398:       } else {
3399:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3400:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3401:         PetscOptionsEnd();
3402:       }
3403:       break;
3404:     case MATPRODUCT_PtAP:
3405:       if (product->api_user) {
3406:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3407:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3408:         PetscOptionsEnd();
3409:       } else {
3410:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3411:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3412:         PetscOptionsEnd();
3413:       }
3414:       break;
3415:     case MATPRODUCT_RARt:
3416:       if (product->api_user) {
3417:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3418:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3419:         PetscOptionsEnd();
3420:       } else {
3421:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3422:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3423:         PetscOptionsEnd();
3424:       }
3425:       break;
3426:     case MATPRODUCT_ABC:
3427:       if (product->api_user) {
3428:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3429:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3430:         PetscOptionsEnd();
3431:       } else {
3432:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3433:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3434:         PetscOptionsEnd();
3435:       }
3436:       break;
3437:     default:
3438:       break;
3439:     }
3440:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3441:   }
3442:   /* dispatch */
3443:   if (isdense) {
3444:     switch (product->type) {
3445:     case MATPRODUCT_AB:
3446:     case MATPRODUCT_AtB:
3447:     case MATPRODUCT_ABt:
3448:     case MATPRODUCT_PtAP:
3449:     case MATPRODUCT_RARt:
3450:       if (product->A->boundtocpu) {
3451:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3452:       } else {
3453:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3454:       }
3455:       break;
3456:     case MATPRODUCT_ABC:
3457:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3458:       break;
3459:     default:
3460:       break;
3461:     }
3462:   } else if (Biscusp && Ciscusp) {
3463:     switch (product->type) {
3464:     case MATPRODUCT_AB:
3465:     case MATPRODUCT_AtB:
3466:     case MATPRODUCT_ABt:
3467:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3468:       break;
3469:     case MATPRODUCT_PtAP:
3470:     case MATPRODUCT_RARt:
3471:     case MATPRODUCT_ABC:
3472:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3473:       break;
3474:     default:
3475:       break;
3476:     }
3477:   } else { /* fallback for AIJ */
3478:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3479:   }
3480:   PetscFunctionReturn(PETSC_SUCCESS);
3481: }

3483: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3484: {
3485:   PetscFunctionBegin;
3486:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3487:   PetscFunctionReturn(PETSC_SUCCESS);
3488: }

3490: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3491: {
3492:   PetscFunctionBegin;
3493:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3494:   PetscFunctionReturn(PETSC_SUCCESS);
3495: }

3497: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3498: {
3499:   PetscFunctionBegin;
3500:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3501:   PetscFunctionReturn(PETSC_SUCCESS);
3502: }

3504: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3505: {
3506:   PetscFunctionBegin;
3507:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3508:   PetscFunctionReturn(PETSC_SUCCESS);
3509: }

3511: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3512: {
3513:   PetscFunctionBegin;
3514:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3515:   PetscFunctionReturn(PETSC_SUCCESS);
3516: }

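/* Helper kernel for MatMultAddKernel_SeqAIJCUSPARSE below: one thread per entry scatters the compressed work
   vector back into the full-length result, performing y[idx[i]] += x[i]. */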
3518: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3519: {
3520:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3521:   if (i < n) y[idx[i]] += x[i];
3522: }

3524: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3525: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3526: {
3527:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3528:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3529:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3530:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3531:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3532:   PetscBool                     compressed;
3533: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3534:   PetscInt nx, ny;
3535: #endif

3537:   PetscFunctionBegin;
3538:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3539:   if (!a->nz) {
3540:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3541:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3542:     PetscFunctionReturn(PETSC_SUCCESS);
3543:   }
3544:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3545:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3546:   if (!trans) {
3547:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3548:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3549:   } else {
3550:     if (herm || !A->form_explicit_transpose) {
3551:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3552:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3553:     } else {
3554:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3555:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3556:     }
3557:   }
3558:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3559:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3561:   try {
3562:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3563:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3564:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3566:     PetscCall(PetscLogGpuTimeBegin());
3567:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3568:       /* z = A x + beta y.
3569:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3570:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3571:       */
3572:       xptr = xarray;
3573:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3574:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3575: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3576:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3577:           allocated to accommodate different uses. So we get the length info directly from mat.
3578:        */
3579:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3580:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3581:         nx             = mat->num_cols;
3582:         ny             = mat->num_rows;
3583:       }
3584: #endif
3585:     } else {
3586:       /* z = A^T x + beta y
3587:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3588:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3589:        */
3590:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3591:       dptr = zarray;
3592:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3593:       if (compressed) { /* Scatter x to work vector */
3594:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

3596:         thrust::for_each(
3597: #if PetscDefined(HAVE_THRUST_ASYNC)
3598:           thrust::cuda::par.on(PetscDefaultCudaStream),
3599: #endif
3600:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3601:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3602:       }
3603: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3604:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3605:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3606:         nx             = mat->num_rows;
3607:         ny             = mat->num_cols;
3608:       }
3609: #endif
3610:     }

3612:     /* csr_spmv does y = alpha op(A) x + beta y */
3613:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3614: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3615:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3616:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3617:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3618:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3619:         PetscCallCUSPARSE(
3620:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3621:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

3623:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3624:       } else {
3625:         /* x, y's value pointers might change between calls, but their sizes stay the same, so we just update the pointers */
3626:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3627:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3628:       }

3630:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3631:                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3632: #else
3633:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3634:       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3635: #endif
3636:     } else {
3637:       if (cusparsestruct->nrows) {
3638: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3639:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3640: #else
3641:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3642:         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3643: #endif
3644:       }
3645:     }
3646:     PetscCall(PetscLogGpuTimeEnd());

3648:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3649:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3650:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3651:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3652:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3653:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3654:         }
3655:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3656:         PetscCall(VecSeq_CUDA::Set(zz, 0));
3657:       }

3659:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3660:       if (compressed) {
3661:         PetscCall(PetscLogGpuTimeBegin());
3662:         /* I wanted to make this for_each asynchronous but failed: thrust::async::for_each() returns an event (internally registered),
3663:            and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to store all
3664:            the events to prevent that, so a custom ScatterAdd kernel is used instead.
3665:          */
3666: #if 0
3667:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3668:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3669:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3670:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3671:                          VecCUDAPlusEquals());
3672: #else
3673:         PetscInt n = matstruct->cprowIndices->size();
3674:         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3675: #endif
3676:         PetscCall(PetscLogGpuTimeEnd());
3677:       }
3678:     } else {
3679:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3680:     }
3681:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3682:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3683:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3684:   } catch (char *ex) {
3685:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3686:   }
3687:   if (yy) {
3688:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3689:   } else {
3690:     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3691:   }
3692:   PetscFunctionReturn(PETSC_SUCCESS);
3693: }

3695: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3696: {
3697:   PetscFunctionBegin;
3698:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3699:   PetscFunctionReturn(PETSC_SUCCESS);
3700: }

3702: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3703: {
3704:   PetscObjectState    onnz = A->nonzerostate;
3705:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

3707:   PetscFunctionBegin;
3708:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3709:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3710:     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3711:     PetscCallCUDA(cudaFree(cusp->deviceMat));
3712:     cusp->deviceMat = NULL;
3713:   }
3714:   PetscFunctionReturn(PETSC_SUCCESS);
3715: }

3717: /*@
3718:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format.
3719:    This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE
3720:    library for calculations. For good matrix
3721:    assembly performance the user should preallocate the matrix storage by setting
3722:    the parameter `nz` (or the array `nnz`).

3724:    Collective

3726:    Input Parameters:
3727: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3728: .  m - number of rows
3729: .  n - number of columns
3730: .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3731: -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3733:    Output Parameter:
3734: .  A - the matrix

3736:    Level: intermediate

3738:    Notes:
3739:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3740:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3741:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3743:    The AIJ format, also called
3744:    compressed row storage, is fully compatible with standard Fortran
3745:    storage.  That is, the stored row and column indices can begin at
3746:    either one (as in Fortran) or zero.

3748:    Specify the preallocated storage with either nz or nnz (not both).
3749:    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3750:    allocation.
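
   Example usage (a minimal sketch, preallocating 3 nonzeros per row; error checking with `PetscCall()` omitted):
.vb
   Mat A;
   MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 10, 10, 3, NULL, &A);
   /* ... insert entries with MatSetValues(), then assemble ... */
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
   MatDestroy(&A);
.ve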

3752: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
3753: @*/
3754: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3755: {
3756:   PetscFunctionBegin;
3757:   PetscCall(MatCreate(comm, A));
3758:   PetscCall(MatSetSizes(*A, m, n, m, n));
3759:   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3760:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3761:   PetscFunctionReturn(PETSC_SUCCESS);
3762: }

3764: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3765: {
3766:   PetscFunctionBegin;
3767:   if (A->factortype == MAT_FACTOR_NONE) {
3768:     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
3769:   } else {
3770:     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3771:   }
3772:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3773:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3774:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3775:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3776:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3777:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3778:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3779:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3780:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3781:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3782:   PetscCall(MatDestroy_SeqAIJ(A));
3783:   PetscFunctionReturn(PETSC_SUCCESS);
3784: }

3786: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3787: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3788: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3789: {
3790:   PetscFunctionBegin;
3791:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3792:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3793:   PetscFunctionReturn(PETSC_SUCCESS);
3794: }

3796: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3797: {
3798:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3799:   Mat_SeqAIJCUSPARSE *cy;
3800:   Mat_SeqAIJCUSPARSE *cx;
3801:   PetscScalar        *ay;
3802:   const PetscScalar  *ax;
3803:   CsrMatrix          *csry, *csrx;

3805:   PetscFunctionBegin;
3806:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3807:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3808:   if (X->ops->axpy != Y->ops->axpy) {
3809:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3810:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3811:     PetscFunctionReturn(PETSC_SUCCESS);
3812:   }
3813:   /* if we are here, it means both matrices are bound to GPU */
3814:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3815:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3816:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3817:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3818:   csry = (CsrMatrix *)cy->mat->mat;
3819:   csrx = (CsrMatrix *)cx->mat->mat;
3820:   /* see if we can turn this into a cublas axpy */
3821:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3822:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3823:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3824:     if (eq) str = SAME_NONZERO_PATTERN;
3825:   }
3826:   /* spgeam is buggy with one column */
3827:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3829:   if (str == SUBSET_NONZERO_PATTERN) {
3830:     PetscScalar b = 1.0;
3831: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3832:     size_t bufferSize;
3833:     void  *buffer;
3834: #endif

3836:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3837:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3838:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3839: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3840:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3841:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3842:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3843:     PetscCall(PetscLogGpuTimeBegin());
3844:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3845:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3846:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3847:     PetscCall(PetscLogGpuTimeEnd());
3848:     PetscCallCUDA(cudaFree(buffer));
3849: #else
3850:     PetscCall(PetscLogGpuTimeBegin());
3851:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3852:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3853:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3854:     PetscCall(PetscLogGpuTimeEnd());
3855: #endif
3856:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3857:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3858:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3859:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3860:   } else if (str == SAME_NONZERO_PATTERN) {
3861:     cublasHandle_t cublasv2handle;
3862:     PetscBLASInt   one = 1, bnz = 1;

3864:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3865:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3866:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3867:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3868:     PetscCall(PetscLogGpuTimeBegin());
3869:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3870:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3871:     PetscCall(PetscLogGpuTimeEnd());
3872:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3873:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3874:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3875:   } else {
3876:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3877:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3878:   }
3879:   PetscFunctionReturn(PETSC_SUCCESS);
3880: }
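/* Usage sketch (added for exposition, not part of the original source): when two
   MATSEQAIJCUSPARSE matrices share the same nonzero pattern, MatAXPY() stays on the GPU and
   reduces to a single cublasXaxpy() over the stored values; other cases take the spgeam or
   CPU fallback paths above.

     Mat X, Y;                                  // both MATSEQAIJCUSPARSE
     MatDuplicate(X, MAT_COPY_VALUES, &Y);      // Y gets the same nonzero pattern as X
     MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN);  // Y <- Y + 2*X, one cublasXaxpy over the values
*/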

3882: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3883: {
3884:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3885:   PetscScalar   *ay;
3886:   cublasHandle_t cublasv2handle;
3887:   PetscBLASInt   one = 1, bnz = 1;

3889:   PetscFunctionBegin;
3890:   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3891:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3892:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3893:   PetscCall(PetscLogGpuTimeBegin());
3894:   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3895:   PetscCall(PetscLogGpuFlops(bnz));
3896:   PetscCall(PetscLogGpuTimeEnd());
3897:   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3898:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3899:   PetscFunctionReturn(PETSC_SUCCESS);
3900: }

3902: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3903: {
3904:   PetscBool   both = PETSC_FALSE;
3905:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3907:   PetscFunctionBegin;
3908:   if (A->factortype == MAT_FACTOR_NONE) {
3909:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3910:     if (spptr->mat) {
3911:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3912:       if (matrix->values) {
3913:         both = PETSC_TRUE;
3914:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3915:       }
3916:     }
3917:     if (spptr->matTranspose) {
3918:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3919:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3920:     }
3921:   }
3922:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3923:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3924:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3925:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3926:   PetscFunctionReturn(PETSC_SUCCESS);
3927: }

3929: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3930: {
3931:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3933:   PetscFunctionBegin;
3934:   if (A->factortype != MAT_FACTOR_NONE) {
3935:     A->boundtocpu = flg;
3936:     PetscFunctionReturn(PETSC_SUCCESS);
3937:   }
3938:   if (flg) {
3939:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

3941:     A->ops->scale                     = MatScale_SeqAIJ;
3942:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3943:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3944:     A->ops->mult                      = MatMult_SeqAIJ;
3945:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3946:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3947:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3948:     A->ops->multhermitiantranspose    = NULL;
3949:     A->ops->multhermitiantransposeadd = NULL;
3950:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3951:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3952:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3953:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3954:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3955:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3956:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3957:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3958:   } else {
3959:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3960:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3961:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3962:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3963:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3964:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3965:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3966:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3967:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3968:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3969:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3970:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3971:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3972:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3973:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3974:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3975:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

3977:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3978:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3979:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3980:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3981:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3982:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3983:   }
3984:   A->boundtocpu = flg;
3985:   if (flg && a->inode.size) {
3986:     a->inode.use = PETSC_TRUE;
3987:   } else {
3988:     a->inode.use = PETSC_FALSE;
3989:   }
3990:   PetscFunctionReturn(PETSC_SUCCESS);
3991: }
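/* Usage sketch (added for exposition): binding an assembled MATSEQAIJCUSPARSE matrix to the
   CPU swaps its operation table back to the plain MATSEQAIJ kernels; unbinding restores the
   CUSPARSE implementations installed above.

     MatBindToCPU(A, PETSC_TRUE);   // subsequent MatMult()/MatScale()/MatAXPY() run on the host
     MatBindToCPU(A, PETSC_FALSE);  // switch back to the GPU implementations
*/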

3993: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3994: {
3995:   Mat B;

3997:   PetscFunctionBegin;
3998:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3999:   if (reuse == MAT_INITIAL_MATRIX) {
4000:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4001:   } else if (reuse == MAT_REUSE_MATRIX) {
4002:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4003:   }
4004:   B = *newmat;

4006:   PetscCall(PetscFree(B->defaultvectype));
4007:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

4009:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4010:     if (B->factortype == MAT_FACTOR_NONE) {
4011:       Mat_SeqAIJCUSPARSE *spptr;
4012:       PetscCall(PetscNew(&spptr));
4013:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4014:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4015:       spptr->format = MAT_CUSPARSE_CSR;
4016: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4017:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4018:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4019:   #else
4020:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4021:   #endif
4022:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4023:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4024: #endif
4025:       B->spptr = spptr;
4026:     } else {
4027:       Mat_SeqAIJCUSPARSETriFactors *spptr;

4029:       PetscCall(PetscNew(&spptr));
4030:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4031:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4032:       B->spptr = spptr;
4033:     }
4034:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4035:   }
4036:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4037:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4038:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4039:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4040:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4041:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

4043:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4044:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4045:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4046: #if defined(PETSC_HAVE_HYPRE)
4047:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4048: #endif
4049:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4050:   PetscFunctionReturn(PETSC_SUCCESS);
4051: }

4053: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4054: {
4055:   PetscFunctionBegin;
4056:   PetscCall(MatCreate_SeqAIJ(B));
4057:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4058:   PetscFunctionReturn(PETSC_SUCCESS);
4059: }

4061: /*MC
4062:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

4064:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4065:    CSR, ELL, or Hybrid format.
4066:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

4068:    Options Database Keys:
4069: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4070: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4071:                                       Other options include ell (ellpack) or hyb (hybrid).
4072: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4073: -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
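
   Example Usage (a minimal sketch; n is a placeholder local size and error checking is omitted):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, n, n, n, n);
   MatSetType(A, MATSEQAIJCUSPARSE);       /* or -mat_type aijcusparse together with MatSetFromOptions() */
   MatSeqAIJSetPreallocation(A, 5, NULL);  /* preallocate as for MATSEQAIJ */
   /* MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd() as usual; MatMult() etc. then run on the GPU */
.ve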

4075:   Level: beginner

4077: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4078: M*/

4080: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

4082: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4083: {
4084:   PetscFunctionBegin;
4085:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
4086:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4087:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4088:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4089:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

4091:   PetscFunctionReturn(PETSC_SUCCESS);
4092: }
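/* Usage sketch (added for exposition): once registered, the CUSPARSE factorizations can be
   selected through the options database, e.g.

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically with PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE). */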

4094: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
4095: {
4096:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

4098:   PetscFunctionBegin;
4099:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4100:   delete cusp->cooPerm;
4101:   delete cusp->cooPerm_a;
4102:   cusp->cooPerm   = NULL;
4103:   cusp->cooPerm_a = NULL;
4104:   if (cusp->use_extended_coo) {
4105:     PetscCallCUDA(cudaFree(cusp->jmap_d));
4106:     PetscCallCUDA(cudaFree(cusp->perm_d));
4107:   }
4108:   cusp->use_extended_coo = PETSC_FALSE;
4109:   PetscFunctionReturn(PETSC_SUCCESS);
4110: }

4112: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
4113: {
4114:   PetscFunctionBegin;
4115:   if (*cusparsestruct) {
4116:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
4117:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
4118:     delete (*cusparsestruct)->workVector;
4119:     delete (*cusparsestruct)->rowoffsets_gpu;
4120:     delete (*cusparsestruct)->cooPerm;
4121:     delete (*cusparsestruct)->cooPerm_a;
4122:     delete (*cusparsestruct)->csr2csc_i;
4123:     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
4124:     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
4125:     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
4126:     PetscCall(PetscFree(*cusparsestruct));
4127:   }
4128:   PetscFunctionReturn(PETSC_SUCCESS);
4129: }

4131: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4132: {
4133:   PetscFunctionBegin;
4134:   if (*mat) {
4135:     delete (*mat)->values;
4136:     delete (*mat)->column_indices;
4137:     delete (*mat)->row_offsets;
4138:     delete *mat;
4139:     *mat = 0;
4140:   }
4141:   PetscFunctionReturn(PETSC_SUCCESS);
4142: }

4144: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4145: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4146: {
4147:   PetscFunctionBegin;
4148:   if (*trifactor) {
4149:     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4150:     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4151:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4152:     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4153:     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4154:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4155:     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4156:   #endif
4157:     PetscCall(PetscFree(*trifactor));
4158:   }
4159:   PetscFunctionReturn(PETSC_SUCCESS);
4160: }
4161: #endif

4163: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4164: {
4165:   CsrMatrix *mat;

4167:   PetscFunctionBegin;
4168:   if (*matstruct) {
4169:     if ((*matstruct)->mat) {
4170:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4171: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4172:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4173: #else
4174:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4175:         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4176: #endif
4177:       } else {
4178:         mat = (CsrMatrix *)(*matstruct)->mat;
4179:         PetscCall(CsrMatrix_Destroy(&mat));
4180:       }
4181:     }
4182:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4183:     delete (*matstruct)->cprowIndices;
4184:     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4185:     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4186:     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

4188: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4189:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4190:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4191:     for (int i = 0; i < 3; i++) {
4192:       if (mdata->cuSpMV[i].initialized) {
4193:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4194:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4195:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4196:       }
4197:     }
4198: #endif
4199:     delete *matstruct;
4200:     *matstruct = NULL;
4201:   }
4202:   PetscFunctionReturn(PETSC_SUCCESS);
4203: }

4205: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4206: {
4207:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

4209:   PetscFunctionBegin;
4210:   if (fs) {
4211: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4212:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4213:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4214:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4215:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4216:     delete fs->workVector;
4217:     fs->workVector = NULL;
4218: #endif
4219:     delete fs->rpermIndices;
4220:     delete fs->cpermIndices;
4221:     fs->rpermIndices = NULL;
4222:     fs->cpermIndices = NULL;
4223:     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
4224:     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
4225:     fs->init_dev_prop = PETSC_FALSE;
4226: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4227:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4228:     PetscCallCUDA(cudaFree(fs->csrColIdx));
4229:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4230:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4231:     PetscCallCUDA(cudaFree(fs->csrVal));
4232:     PetscCallCUDA(cudaFree(fs->diag));
4233:     PetscCallCUDA(cudaFree(fs->X));
4234:     PetscCallCUDA(cudaFree(fs->Y));
4235:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M is shared with one of spsvBuffer_L/U */
4236:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4237:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4238:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4239:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4240:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4241:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4242:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4243:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4244:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4245:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4246:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4247:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4248:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4249:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4250:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4251:     PetscCall(PetscFree(fs->csrRowPtr_h));
4252:     PetscCall(PetscFree(fs->csrVal_h));
4253:     PetscCall(PetscFree(fs->diag_h));
4254:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4255:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4256: #endif
4257:   }
4258:   PetscFunctionReturn(PETSC_SUCCESS);
4259: }

4261: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4262: {
4263:   PetscFunctionBegin;
4264:   if (*trifactors) {
4265:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4266:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4267:     PetscCall(PetscFree(*trifactors));
4268:   }
4269:   PetscFunctionReturn(PETSC_SUCCESS);
4270: }

4272: struct IJCompare {
4273:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4274:   {
4275:     if (t1.get<0>() < t2.get<0>()) return true;
4276:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4277:     return false;
4278:   }
4279: };

4281: struct IJEqual {
4282:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4283:   {
4284:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
4285:     return true;
4286:   }
4287: };

4289: struct IJDiff {
4290:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
4291: };

4293: struct IJSum {
4294:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
4295: };

4297: #include <thrust/iterator/discard_iterator.h>
4298: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4299: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4300: {
4301:   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
4302:   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4303:   THRUSTARRAY                          *cooPerm_v = NULL;
4304:   thrust::device_ptr<const PetscScalar> d_v;
4305:   CsrMatrix                            *matrix;
4306:   PetscInt                              n;

4308:   PetscFunctionBegin;
4309:   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
4310:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
4311:   if (!cusp->cooPerm) {
4312:     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
4313:     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
4314:     PetscFunctionReturn(PETSC_SUCCESS);
4315:   }
4316:   matrix = (CsrMatrix *)cusp->mat->mat;
4317:   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4318:   if (!v) {
4319:     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4320:     goto finalize;
4321:   }
4322:   n = cusp->cooPerm->size();
4323:   if (isCudaMem(v)) {
4324:     d_v = thrust::device_pointer_cast(v);
4325:   } else {
4326:     cooPerm_v = new THRUSTARRAY(n);
4327:     cooPerm_v->assign(v, v + n);
4328:     d_v = cooPerm_v->data();
4329:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4330:   }
4331:   PetscCall(PetscLogGpuTimeBegin());
4332:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4333:     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them up */
4334:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4335:       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4336:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4337:         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of nonzeros in d_v[].
4338:         cooPerm_a is ordered. d_v[cooPerm[i]] contributes to the cooPerm_a[i]-th unique nonzero.
4339:       */
4340:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4341:       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4342:       delete cooPerm_w;
4343:     } else {
4344:       /* all nonzeros in d_v[] are unique entries */
4345:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4346:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4347:       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4348:     }
4349:   } else {
4350:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4351:       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4352:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4353:     } else {
4354:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4355:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4356:       thrust::for_each(zibit, zieit, VecCUDAEquals());
4357:     }
4358:   }
4359:   PetscCall(PetscLogGpuTimeEnd());
4360: finalize:
4361:   delete cooPerm_v;
4362:   A->offloadmask = PETSC_OFFLOAD_GPU;
4363:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4364:   /* shorter version of MatAssemblyEnd_SeqAIJ */
4365:   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
4366:   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
4367:   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4368:   a->reallocs = 0;
4369:   A->info.mallocs += 0;
4370:   A->info.nz_unneeded = 0;
4371:   A->assembled = A->was_assembled = PETSC_TRUE;
4372:   A->num_ass++;
4373:   PetscFunctionReturn(PETSC_SUCCESS);
4374: }

4376: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4377: {
4378:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4380:   PetscFunctionBegin;
4381:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4382:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4383:   if (destroy) {
4384:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4385:     delete cusp->csr2csc_i;
4386:     cusp->csr2csc_i = NULL;
4387:   }
4388:   A->transupdated = PETSC_FALSE;
4389:   PetscFunctionReturn(PETSC_SUCCESS);
4390: }

4392: #include <thrust/binary_search.h>
4393: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4394: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4395: {
4396:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4397:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4398:   PetscInt            cooPerm_n, nzr = 0;

4400:   PetscFunctionBegin;
4401:   PetscCall(PetscLayoutSetUp(A->rmap));
4402:   PetscCall(PetscLayoutSetUp(A->cmap));
4403:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4404:   if (n != cooPerm_n) {
4405:     delete cusp->cooPerm;
4406:     delete cusp->cooPerm_a;
4407:     cusp->cooPerm   = NULL;
4408:     cusp->cooPerm_a = NULL;
4409:   }
4410:   if (n) {
4411:     thrust::device_ptr<PetscInt> d_i, d_j;
4412:     PetscInt                    *d_raw_i, *d_raw_j;
4413:     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4414:     PetscMemType                 imtype, jmtype;

4416:     PetscCall(PetscGetMemType(coo_i, &imtype));
4417:     if (PetscMemTypeHost(imtype)) {
4418:       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4419:       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4420:       d_i        = thrust::device_pointer_cast(d_raw_i);
4421:       free_raw_i = PETSC_TRUE;
4422:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4423:     } else {
4424:       d_i = thrust::device_pointer_cast(coo_i);
4425:     }

4427:     PetscCall(PetscGetMemType(coo_j, &jmtype));
4428:     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4429:       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4430:       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4431:       d_j        = thrust::device_pointer_cast(d_raw_j);
4432:       free_raw_j = PETSC_TRUE;
4433:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4434:     } else {
4435:       d_j = thrust::device_pointer_cast(coo_j);
4436:     }

4438:     THRUSTINTARRAY ii(A->rmap->n);

4440:     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4441:     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

4443:     /* Ex.
4444:       n = 6
4445:       coo_i = [3,3,1,4,1,4]
4446:       coo_j = [3,2,2,5,2,6]
4447:     */
4448:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4449:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

4451:     PetscCall(PetscLogGpuTimeBegin());
4452:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4453:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4454:     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4455:     THRUSTINTARRAY w(d_j, d_j + n);

4457:     /*
4458:       d_i     = [1,1,3,3,4,4]
4459:       d_j     = [2,2,2,3,5,6]
4460:       cooPerm = [2,4,1,0,3,5]
4461:     */
4462:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

4464:     /*
4465:       d_i     = [1,3,3,4,4,x]
4466:                             ^ekey
4467:       d_j     = [2,2,3,5,6,x]
4468:                            ^nekey
4469:     */
4470:     if (nekey == ekey) { /* all entries are unique */
4471:       delete cusp->cooPerm_a;
4472:       cusp->cooPerm_a = NULL;
4473:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4474:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4475:       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4476:       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4477:       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4478:       w[0]                  = 0;
4479:       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4480:       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4481:     }
4482:     thrust::counting_iterator<PetscInt> search_begin(0);
4483:     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search the values 0,1,...,A->rmap->n-1 in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4484:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4485:                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4486:     PetscCall(PetscLogGpuTimeEnd());

4488:     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4489:     a->singlemalloc = PETSC_FALSE;
4490:     a->free_a       = PETSC_TRUE;
4491:     a->free_ij      = PETSC_TRUE;
4492:     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4493:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4494:     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4495:     a->nz = a->maxnz = a->i[A->rmap->n];
4496:     a->rmax          = 0;
4497:     PetscCall(PetscMalloc1(a->nz, &a->a));
4498:     PetscCall(PetscMalloc1(a->nz, &a->j));
4499:     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4500:     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4501:     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4502:     for (PetscInt i = 0; i < A->rmap->n; i++) {
4503:       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4504:       nzr += (PetscInt) !!(nnzr);
4505:       a->ilen[i] = a->imax[i] = nnzr;
4506:       a->rmax                 = PetscMax(a->rmax, nnzr);
4507:     }
4508:     a->nonzerorowcnt = nzr;
4509:     A->preallocated  = PETSC_TRUE;
4510:     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4511:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4512:     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4513:     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4514:   } else {
4515:     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4516:   }
4517:   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

4519:   /* We want to allocate the CUSPARSE struct for matvec now.
4520:      The code is so convoluted now that I prefer to copy zeros */
4521:   PetscCall(PetscArrayzero(a->a, a->nz));
4522:   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4523:   A->offloadmask = PETSC_OFFLOAD_CPU;
4524:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4525:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4526:   PetscFunctionReturn(PETSC_SUCCESS);
4527: }

4529: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4530: {
4531:   Mat_SeqAIJ         *seq;
4532:   Mat_SeqAIJCUSPARSE *dev;
4533:   PetscBool           coo_basic = PETSC_TRUE;
4534:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

4536:   PetscFunctionBegin;
4537:   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4538:   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4539:   if (coo_i) {
4540:     PetscCall(PetscGetMemType(coo_i, &mtype));
4541:     if (PetscMemTypeHost(mtype)) {
4542:       for (PetscCount k = 0; k < coo_n; k++) {
4543:         if (coo_i[k] < 0 || coo_j[k] < 0) {
4544:           coo_basic = PETSC_FALSE;
4545:           break;
4546:         }
4547:       }
4548:     }
4549:   }

4551:   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4552:     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4553:   } else {
4554:     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4555:     mat->offloadmask = PETSC_OFFLOAD_CPU;
4556:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4557:     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4558:     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4559:     PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
4560:     PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4561:     PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
4562:     PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4563:     dev->use_extended_coo = PETSC_TRUE;
4564:   }
4565:   PetscFunctionReturn(PETSC_SUCCESS);
4566: }

4568: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4569: {
4570:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4571:   const PetscCount grid_size = gridDim.x * blockDim.x;
4572:   for (; i < nnz; i += grid_size) {
4573:     PetscScalar sum = 0.0;
4574:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4575:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4576:   }
4577: }
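/* Illustration (added for exposition): for the i-th nonzero of the matrix, jmap[i]..jmap[i+1]
   delimits the entries of perm[] selecting the COO input values that contribute to it, so the
   kernel above computes a[i] (+)= sum over k of kv[perm[k]]. For example, if the user supplied
   the COO triplets (0,0), (0,0), (1,2), then jmap = {0,2,3} and the two repeated (0,0) values
   are summed into the first stored nonzero. */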

4579: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4580: {
4581:   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4582:   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4583:   PetscCount          Annz = seq->nz;
4584:   PetscMemType        memtype;
4585:   const PetscScalar  *v1 = v;
4586:   PetscScalar        *Aa;

4588:   PetscFunctionBegin;
4589:   if (dev->use_extended_coo) {
4590:     PetscCall(PetscGetMemType(v, &memtype));
4591:     if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4592:       PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
4593:       PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4594:     }

4596:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4597:     else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

4599:     if (Annz) {
4600:       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4601:       PetscCallCUDA(cudaPeekAtLastError());
4602:     }

4604:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4605:     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

4607:     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4608:   } else {
4609:     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4610:   }
4611:   PetscFunctionReturn(PETSC_SUCCESS);
4612: }
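/* Usage sketch (added for exposition): the COO interface assembles a MATSEQAIJCUSPARSE matrix
   directly from (i,j,v) triplets; repeated (i,j) pairs are summed, and the value array may live
   on the host or on the device.

     PetscInt    coo_i[] = {0, 0, 1};     // row indices
     PetscInt    coo_j[] = {0, 0, 2};     // column indices; the two (0,0) entries will be summed
     PetscScalar v[]     = {1.0, 2.0, 3.0};
     MatSetPreallocationCOO(A, 3, coo_i, coo_j);
     MatSetValuesCOO(A, v, INSERT_VALUES);
*/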

4614: /*@C
4615:     MatSeqAIJCUSPARSEGetIJ - returns the device CSR row offsets `i` and column indices `j` for `MATSEQAIJCUSPARSE` matrices.

4617:    Not Collective

4619:     Input Parameters:
4620: +   A - the matrix
4621: -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should be returned in compressed form

4623:     Output Parameters:
4624: +   i - the CSR row pointers
4625: -   j - the CSR column indices

4627:     Level: developer

4629:     Note:
4630:       When compressed is true, the CSR structure does not contain empty rows
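
    Example Usage (a minimal sketch):
.vb
    const int *i, *j;
    MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j);
    /* i and j are device pointers to the CSR row offsets and column indices */
    MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j);
.ve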

4632: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4633: @*/
4634: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4635: {
4636:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4637:   CsrMatrix          *csr;
4638:   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

4640:   PetscFunctionBegin;
4642:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4643:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4644:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4645:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4646:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4647:   csr = (CsrMatrix *)cusp->mat->mat;
4648:   if (i) {
4649:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4650:       if (!cusp->rowoffsets_gpu) {
4651:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4652:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4653:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4654:       }
4655:       *i = cusp->rowoffsets_gpu->data().get();
4656:     } else *i = csr->row_offsets->data().get();
4657:   }
4658:   if (j) *j = csr->column_indices->data().get();
4659:   PetscFunctionReturn(PETSC_SUCCESS);
4660: }

4662: /*@C
4663:     MatSeqAIJCUSPARSERestoreIJ - restores the device CSR row offsets `i` and column indices `j` obtained with `MatSeqAIJCUSPARSEGetIJ()`

4665:    Not Collective

4667:     Input Parameters:
4668: +   A - the matrix
4669: .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should be returned in compressed form
4670: .   i - the CSR row pointers
4671: -   j - the CSR column indices

4673:     Level: developer

4675: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4676: @*/
4677: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4678: {
4679:   PetscFunctionBegin;
4681:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4682:   if (i) *i = NULL;
4683:   if (j) *j = NULL;
4684:   (void)compressed;
4685:   PetscFunctionReturn(PETSC_SUCCESS);
4686: }

4688: /*@C
4689:    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4691:    Not Collective

4693:    Input Parameter:
4694: .   A - a `MATSEQAIJCUSPARSE` matrix

4696:    Output Parameter:
4697: .   a - pointer to the device data

4699:    Level: developer

4701:    Note:
4702:    May trigger host-device copies if up-to-date matrix data is on host
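
   Example Usage (a minimal sketch):
.vb
   const PetscScalar *aa;
   MatSeqAIJCUSPARSEGetArrayRead(A, &aa);
   /* aa is a device pointer to the stored nonzero values; it can be passed to CUDA or cuSPARSE code */
   MatSeqAIJCUSPARSERestoreArrayRead(A, &aa);
.ve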

4704: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4705: @*/
4706: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4707: {
4708:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4709:   CsrMatrix          *csr;

4711:   PetscFunctionBegin;
4714:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4715:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4716:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4717:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4718:   csr = (CsrMatrix *)cusp->mat->mat;
4719:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4720:   *a = csr->values->data().get();
4721:   PetscFunctionReturn(PETSC_SUCCESS);
4722: }

4724: /*@C
4725:    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

4727:    Not Collective

4729:    Input Parameters:
4730: +   A - a `MATSEQAIJCUSPARSE` matrix
4731: -   a - pointer to the device data

4733:    Level: developer

4735: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4736: @*/
4737: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4738: {
4739:   PetscFunctionBegin;
4742:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4743:   *a = NULL;
4744:   PetscFunctionReturn(PETSC_SUCCESS);
4745: }

4747: /*@C
4748:    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4750:    Not Collective

4752:    Input Parameter:
4753: .   A - a `MATSEQAIJCUSPARSE` matrix

4755:    Output Parameter:
4756: .   a - pointer to the device data

4758:    Level: developer

4760:    Note:
4761:    May trigger host-device copies if up-to-date matrix data is on host
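
   Example Usage (a minimal sketch; restoring the array marks the matrix as modified):
.vb
   PetscScalar *aa;
   MatSeqAIJCUSPARSEGetArray(A, &aa);
   /* read or modify the nonzero values in place on the device */
   MatSeqAIJCUSPARSERestoreArray(A, &aa);
.ve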

4763: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4764: @*/
4765: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4766: {
4767:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4768:   CsrMatrix          *csr;

4770:   PetscFunctionBegin;
4773:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4774:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4775:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4776:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4777:   csr = (CsrMatrix *)cusp->mat->mat;
4778:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4779:   *a             = csr->values->data().get();
4780:   A->offloadmask = PETSC_OFFLOAD_GPU;
4781:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4782:   PetscFunctionReturn(PETSC_SUCCESS);
4783: }

4784: /*@C
4785:    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

4787:    Not Collective

4789:    Input Parameters:
4790: +   A - a `MATSEQAIJCUSPARSE` matrix
4791: -   a - pointer to the device data

4793:    Level: developer

4795: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4796: @*/
4797: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4798: {
4799:   PetscFunctionBegin;
4802:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4803:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4804:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4805:   *a = NULL;
4806:   PetscFunctionReturn(PETSC_SUCCESS);
4807: }

4809: /*@C
4810:    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4812:    Not Collective

4814:    Input Parameter:
4815: .   A - a `MATSEQAIJCUSPARSE` matrix

4817:    Output Parameter:
4818: .   a - pointer to the device data

4820:    Level: developer

4822:    Note:
4823:    Does not trigger host-device copies and flags data validity on the GPU

4825: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4826: @*/
4827: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4828: {
4829:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4830:   CsrMatrix          *csr;

4832:   PetscFunctionBegin;
4835:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4836:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4837:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4838:   csr = (CsrMatrix *)cusp->mat->mat;
4839:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4840:   *a             = csr->values->data().get();
4841:   A->offloadmask = PETSC_OFFLOAD_GPU;
4842:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4843:   PetscFunctionReturn(PETSC_SUCCESS);
4844: }

4846: /*@C
4847:    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

4849:    Not Collective

4851:    Input Parameters:
4852: +   A - a `MATSEQAIJCUSPARSE` matrix
4853: -   a - pointer to the device data

4855:    Level: developer

4857: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4858: @*/
4859: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4860: {
4861:   PetscFunctionBegin;
4864:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4865:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4866:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4867:   *a = NULL;
4868:   PetscFunctionReturn(PETSC_SUCCESS);
4869: }

4871: struct IJCompare4 {
4872:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4873:   {
4874:     if (t1.get<0>() < t2.get<0>()) return true;
4875:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4876:     return false;
4877:   }
4878: };

4880: struct Shift {
4881:   int _shift;

4883:   Shift(int shift) : _shift(shift) { }
4884:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4885: };

4887: /* Merges two SeqAIJCUSPARSE matrices A and B side by side: row i of C is row i of A followed by row i of B, i.e. C = [A B] ([A';B']' in MATLAB notation) */
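/* Illustration (added for exposition): with
     A = [1 0 2]      B = [5 0]
         [0 3 0]          [0 6]
   the merged matrix is
     C = [1 0 2 5 0]
         [0 3 0 0 6]
   i.e. C has A->cmap->n + B->cmap->n columns and the same number of rows as A and B. */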
4888: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4889: {
4890:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4891:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4892:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4893:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4894:   PetscInt                      Annz, Bnnz;
4895:   cusparseStatus_t              stat;
4896:   PetscInt                      i, m, n, zero = 0;

4898:   PetscFunctionBegin;
4902:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4903:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4904:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4905:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4906:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4907:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4908:   if (reuse == MAT_INITIAL_MATRIX) {
4909:     m = A->rmap->n;
4910:     n = A->cmap->n + B->cmap->n;
4911:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4912:     PetscCall(MatSetSizes(*C, m, n, m, n));
4913:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4914:     c                       = (Mat_SeqAIJ *)(*C)->data;
4915:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4916:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4917:     Ccsr                    = new CsrMatrix;
4918:     Cmat->cprowIndices      = NULL;
4919:     c->compressedrow.use    = PETSC_FALSE;
4920:     c->compressedrow.nrows  = 0;
4921:     c->compressedrow.i      = NULL;
4922:     c->compressedrow.rindex = NULL;
4923:     Ccusp->workVector       = NULL;
4924:     Ccusp->nrows            = m;
4925:     Ccusp->mat              = Cmat;
4926:     Ccusp->mat->mat         = Ccsr;
4927:     Ccsr->num_rows          = m;
4928:     Ccsr->num_cols          = n;
4929:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4930:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4931:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4932:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4933:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4934:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4935:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4936:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4937:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4938:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4939:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4940:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4941:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

4943:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4944:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4945:     Annz                 = (PetscInt)Acsr->column_indices->size();
4946:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4947:     c->nz                = Annz + Bnnz;
4948:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4949:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4950:     Ccsr->values         = new THRUSTARRAY(c->nz);
4951:     Ccsr->num_entries    = c->nz;
4952:     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4953:     if (c->nz) {
4954:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4955:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4956:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4957:       THRUSTINTARRAY32 *Aroff, *Broff;

4959:       if (a->compressedrow.use) { /* need full row offset */
4960:         if (!Acusp->rowoffsets_gpu) {
4961:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4962:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4963:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4964:         }
4965:         Aroff = Acusp->rowoffsets_gpu;
4966:       } else Aroff = Acsr->row_offsets;
4967:       if (b->compressedrow.use) { /* need full row offset */
4968:         if (!Bcusp->rowoffsets_gpu) {
4969:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4970:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4971:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4972:         }
4973:         Broff = Bcusp->rowoffsets_gpu;
4974:       } else Broff = Bcsr->row_offsets;
4975:       PetscCall(PetscLogGpuTimeBegin());
4976:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4977:       PetscCallCUSPARSE(stat);
4978:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4979:       PetscCallCUSPARSE(stat);
4980:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4981:       auto Aperm = thrust::make_constant_iterator(1);
4982:       auto Bperm = thrust::make_constant_iterator(0);
4983: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4984:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4985:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4986: #else
4987:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4988:       auto Bcib = Bcsr->column_indices->begin();
4989:       auto Bcie = Bcsr->column_indices->end();
4990:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4991: #endif
4992:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4993:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4994:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4995:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4996:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4997:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4998:       auto p1    = Ccusp->cooPerm->begin();
4999:       auto p2    = Ccusp->cooPerm->begin();
5000:       thrust::advance(p2, Annz);
5001:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
5002: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
5003:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
5004: #endif
5005:       auto cci = thrust::make_counting_iterator(zero);
5006:       auto cce = thrust::make_counting_iterator(c->nz);
5007: #if 0 // Errors on SUMMIT CUDA 11.1.0
5008:       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5009: #else
5010:       auto pred = thrust::identity<int>();
5011:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
5012:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
5013: #endif
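      /* Convert C's merged COO row indices back into CSR row offsets */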
5014:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
5015:       PetscCallCUSPARSE(stat);
5016:       PetscCall(PetscLogGpuTimeEnd());
5017:       delete wPerm;
5018:       delete Acoo;
5019:       delete Bcoo;
5020:       delete Ccoo;
5021: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5022:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5023:       PetscCallCUSPARSE(stat);
5024: #endif
5025:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B both keep explicit transposes, generate C's transpose too */
5026:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5027:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5028:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5029:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5030:         CsrMatrix                    *CcsrT = new CsrMatrix;
5031:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5032:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

5034:         (*C)->form_explicit_transpose = PETSC_TRUE;
5035:         (*C)->transupdated            = PETSC_TRUE;
5036:         Ccusp->rowoffsets_gpu         = NULL;
5037:         CmatT->cprowIndices           = NULL;
5038:         CmatT->mat                    = CcsrT;
5039:         CcsrT->num_rows               = n;
5040:         CcsrT->num_cols               = m;
5041:         CcsrT->num_entries            = c->nz;

5043:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
5044:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5045:         CcsrT->values         = new THRUSTARRAY(c->nz);

5047:         PetscCall(PetscLogGpuTimeBegin());
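        /* C = [A B], so C^T stacks A^T on top of B^T: copy A^T's CSR arrays first, then B^T's,
           shifting B^T's row offsets by a->nz (the number of entries contributed by A^T) */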
5048:         auto rT = CcsrT->row_offsets->begin();
5049:         if (AT) {
5050:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
5051:           thrust::advance(rT, -1);
5052:         }
5053:         if (BT) {
5054:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
5055:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
5056:           thrust::copy(titb, tite, rT);
5057:         }
5058:         auto cT = CcsrT->column_indices->begin();
5059:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
5060:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
5061:         auto vT = CcsrT->values->begin();
5062:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5063:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5064:         PetscCall(PetscLogGpuTimeEnd());

5066:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5067:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5068:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5069:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
5070:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
5071:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5072:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5073:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5074:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
5075: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
5076:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
5077:         PetscCallCUSPARSE(stat);
5078: #endif
5079:         Ccusp->matTranspose = CmatT;
5080:       }
5081:     }
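    /* Allocate and fill the host-side SeqAIJ arrays (row offsets, column indices, row lengths)
       so the CPU representation of C matches the CSR structure just assembled on the GPU */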

5083:     c->singlemalloc = PETSC_FALSE;
5084:     c->free_a       = PETSC_TRUE;
5085:     c->free_ij      = PETSC_TRUE;
5086:     PetscCall(PetscMalloc1(m + 1, &c->i));
5087:     PetscCall(PetscMalloc1(c->nz, &c->j));
5088:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
5089:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5090:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5091:       ii = *Ccsr->row_offsets;
5092:       jj = *Ccsr->column_indices;
5093:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5094:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5095:     } else {
5096:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5097:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5098:     }
5099:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5100:     PetscCall(PetscMalloc1(m, &c->ilen));
5101:     PetscCall(PetscMalloc1(m, &c->imax));
5102:     c->maxnz         = c->nz;
5103:     c->nonzerorowcnt = 0;
5104:     c->rmax          = 0;
5105:     for (i = 0; i < m; i++) {
5106:       const PetscInt nn = c->i[i + 1] - c->i[i];
5107:       c->ilen[i] = c->imax[i] = nn;
5108:       c->nonzerorowcnt += (PetscInt) !!nn;
5109:       c->rmax = PetscMax(c->rmax, nn);
5110:     }
5111:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5112:     PetscCall(PetscMalloc1(c->nz, &c->a));
5113:     (*C)->nonzerostate++;
5114:     PetscCall(PetscLayoutSetUp((*C)->rmap));
5115:     PetscCall(PetscLayoutSetUp((*C)->cmap));
5116:     Ccusp->nonzerostate = (*C)->nonzerostate;
5117:     (*C)->preallocated  = PETSC_TRUE;
5118:   } else {
5119:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5120:     c = (Mat_SeqAIJ *)(*C)->data;
5121:     if (c->nz) {
5122:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5123:       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
5124:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5125:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5126:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5127:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5128:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5129:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5130:       Acsr = (CsrMatrix *)Acusp->mat->mat;
5131:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5132:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5133:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5134:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5135:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5136:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5137:       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
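      /* Same nonzero pattern as before, so only the values need updating: cooPerm gives, for
         each entry of A (first Acsr->num_entries positions) and of B (the rest), its slot in
         C's value array; scatter the current values through permutation iterators */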
5138:       auto pmid = Ccusp->cooPerm->begin();
5139:       thrust::advance(pmid, Acsr->num_entries);
5140:       PetscCall(PetscLogGpuTimeBegin());
5141:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
5142:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5143:       thrust::for_each(zibait, zieait, VecCUDAEquals());
5144:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5145:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
5146:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5147:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5148:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5149:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5150:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5151:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5152:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5153:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5154:         auto       vT    = CcsrT->values->begin();
5155:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5156:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5157:         (*C)->transupdated = PETSC_TRUE;
5158:       }
5159:       PetscCall(PetscLogGpuTimeEnd());
5160:     }
5161:   }
5162:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5163:   (*C)->assembled     = PETSC_TRUE;
5164:   (*C)->was_assembled = PETSC_FALSE;
5165:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5166:   PetscFunctionReturn(PETSC_SUCCESS);
5167: }

5169: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
5170: {
5171:   bool               dmem;
5172:   const PetscScalar *av;

5174:   PetscFunctionBegin;
5175:   dmem = isCudaMem(v);
5176:   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
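  /* If an index set is given, gather av[idx[i]] on the GPU, staging through a temporary device
     buffer when v is host memory; otherwise copy the first n values of av directly */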
5177:   if (n && idx) {
5178:     THRUSTINTARRAY widx(n);
5179:     widx.assign(idx, idx + n);
5180:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

5182:     THRUSTARRAY                    *w = NULL;
5183:     thrust::device_ptr<PetscScalar> dv;
5184:     if (dmem) {
5185:       dv = thrust::device_pointer_cast(v);
5186:     } else {
5187:       w  = new THRUSTARRAY(n);
5188:       dv = w->data();
5189:     }
5190:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

5192:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5193:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5194:     thrust::for_each(zibit, zieit, VecCUDAEquals());
5195:     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5196:     delete w;
5197:   } else {
5198:     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5199:   }
5200:   if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
5201:   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5202:   PetscFunctionReturn(PETSC_SUCCESS);
5203: }