Actual source code: ex5cu.cu

  1: static char help[] = "Test of CUDA matrix assemble with simple matrix.\n\n";

  3: // This a minimal example of the use of the CUDA MatAIJ metadata for assembly.
  4: //
  5: // The matrix must be a type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
  6: // Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
  7: //    the GPU with MatCUSPARSEGetDeviceMatWrite().
  8: // Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
  9: // Finally call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between the
 10: //    host and GPU.

 12: #include <petscconf.h>
 13: #include <petscmat.h>
 14: #include <petscdevice_cuda.h>
 15: #include <assert.h>

 17: #include <petscaijdevice.h>

 19: __global__ void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
 20: {
 21:   const PetscInt inc = blockDim.x, my0 = threadIdx.x;
 22:   PetscInt       i;
 23:   int            err;

 25:   for (i = start + my0; i < end + 1; i += inc) {
 26:     PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2; // negative indices are ignored but >= N are not, so clip end
 27:     PetscScalar values[] = {1, 1, 1, 1};
 28:     err                  = MatSetValuesDevice(d_mat, nn, js, nn, js, values, ADD_VALUES);
 29:     if (err) assert(0);
 30:   }
 31: }

 33: PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt)
 34: {
 35:   PetscFunctionBeginUser;
 36:   for (PetscInt i = start; i < end + 1; i++) {
 37:     PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2;
 38:     PetscScalar values[] = {1, 1, 1, 1};
 39:     PetscCall(MatSetValues(A, nn, js, nn, js, values, ADD_VALUES));
 40:   }
 41:   PetscFunctionReturn(PETSC_SUCCESS);
 42: }

 44: int main(int argc, char **args)
 45: {
 46:   Mat                        A;
 47:   PetscInt                   N = 11, nz = 3, Istart, Iend, num_threads = 128;
 48:   PetscSplitCSRDataStructure d_mat;
 49:   PetscLogEvent              event;
 50:   PetscMPIInt                rank, size;
 51:   PetscBool                  testmpiseq = PETSC_FALSE;
 52:   Vec                        x, y;

 54:   PetscFunctionBeginUser;
 55:   PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
 56:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL));
 57:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-num_threads", &num_threads, NULL));
 58:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-nz_row", &nz, NULL));
 59:   PetscCall(PetscOptionsGetBool(NULL, NULL, "-testmpiseq", &testmpiseq, NULL));
 60:   if (nz < 3) nz = 3;
 61:   if (nz > N + 1) nz = N + 1;
 62:   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
 63:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

 65:   PetscCall(PetscLogEventRegister("GPU operator", MAT_CLASSID, &event));
 66:   PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, N, N, nz, NULL, nz - 1, NULL, &A));
 67:   PetscCall(MatSetFromOptions(A));
 68:   PetscCall(MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE));
 69:   PetscCall(MatCreateVecs(A, &x, &y));
 70:   PetscCall(MatGetOwnershipRange(A, &Istart, &Iend));
 71:   /* current GPU assembly code does not support offprocessor values insertion */
 72:   PetscCall(assemble_on_cpu(A, Istart, Iend, N, rank));
 73:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
 74:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

 76:   // test
 77:   PetscCall(VecSet(x, 1.0));
 78:   PetscCall(MatMult(A, x, y));
 79:   PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));

 81:   if (testmpiseq && size == 1) {
 82:     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INPLACE_MATRIX, &A));
 83:     PetscCall(MatConvert(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
 84:   }
 85:   PetscCall(PetscLogEventBegin(event, 0, 0, 0, 0));
 86:   PetscCall(MatCUSPARSEGetDeviceMatWrite(A, &d_mat));
 87:   assemble_on_gpu<<<1, num_threads>>>(d_mat, Istart, Iend, N, rank);
 88:   PetscCallCUDA(cudaDeviceSynchronize());
 89:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
 90:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
 91:   PetscCall(PetscLogEventEnd(event, 0, 0, 0, 0));

 93:   // test
 94:   PetscCall(VecSet(x, 1.0));
 95:   PetscCall(MatMult(A, x, y));
 96:   PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));

 98:   PetscCall(MatDestroy(&A));
 99:   PetscCall(VecDestroy(&x));
100:   PetscCall(VecDestroy(&y));
101:   PetscCall(PetscFinalize());
102:   return 0;
103: }

105: /*TEST

107:    build:
108:       requires: cuda !defined(PETSC_HAVE_CUDA_CLANG)

110:    test:
111:       suffix: 0
112:       diff_args: -j
113:       args: -n 11 -ex5_vec_view
114:       nsize: 1

116:    test:
117:       suffix: 1
118:       diff_args: -j
119:       args: -n 11 -ex5_vec_view
120:       nsize: 2

122:    test:
123:       suffix: 2
124:       diff_args: -j
125:       args: -n 11 -testmpiseq -ex5_vec_view
126:       nsize: 1

128: TEST*/