Actual source code: ex5cu.cu

static char help[] = "Test of CUDA matrix assembly with a simple matrix.\n\n";

// This is a minimal example of the use of the CUDA MatAIJ metadata for assembly.
//
// The matrix must be of type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
// Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
// the GPU with MatCUSPARSEGetDeviceMatWrite().
// Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
// Finally, call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between
// the host and the GPU.

#include <petscconf.h>
#include <petscmat.h>
#include <petscdevice_cuda.h>
#include <assert.h>

#include <petscaijdevice.h>

__global__ void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
{
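  // Each thread of the single launched block strides over the locally owned rows and, for row i,
  // adds a 2x2 block of ones at rows/columns {i - 1, i} (clipped at the last global row).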
  const PetscInt inc = blockDim.x, my0 = threadIdx.x;
  PetscInt       i;
  int            err;

  for (i = start + my0; i < end + 1; i += inc) {
    PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2; // negative indices are ignored but >= N are not, so clip end
    PetscScalar values[] = {1, 1, 1, 1};
    err = MatSetValuesDevice(d_mat, nn, js, nn, js, values, ADD_VALUES);
    if (err) assert(0);
  }
}

PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt)
{
  PetscFunctionBeginUser;
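  // Same insertion pattern as the GPU kernel, but through the host-side MatSetValues();
  // this first pass establishes the nonzero pattern that the device assembly later reuses.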
  for (PetscInt i = start; i < end + 1; i++) {
    PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2;
    PetscScalar values[] = {1, 1, 1, 1};
    PetscCall(MatSetValues(A, nn, js, nn, js, values, ADD_VALUES));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **args)
{
  Mat                        A;
  PetscInt                   N = 11, nz = 3, Istart, Iend, num_threads = 128;
  PetscSplitCSRDataStructure d_mat;
  PetscLogEvent              event;
  PetscMPIInt                rank, size;
  PetscBool                  testmpiseq = PETSC_FALSE;
  Vec                        x, y;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, (char *)0, help));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-num_threads", &num_threads, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-nz_row", &nz, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-testmpiseq", &testmpiseq, NULL));
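  // Keep the user-supplied per-row preallocation estimate in a sensible range; the stencil touches at most three columns per row.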
  if (nz < 3) nz = 3;
  if (nz > N + 1) nz = N + 1;
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  PetscCall(PetscLogEventRegister("GPU operator", MAT_CLASSID, &event));
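  // Create the AIJCUSPARSE matrix, preallocating nz nonzeros per row in the diagonal block and nz - 1 in the off-diagonal block.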
  PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, N, N, nz, NULL, nz - 1, NULL, &A));
  PetscCall(MatSetFromOptions(A));
  PetscCall(MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE));
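  // With this option, MatSetValues() drops entries destined for other ranks instead of communicating them.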
  PetscCall(MatCreateVecs(A, &x, &y));
  PetscCall(MatGetOwnershipRange(A, &Istart, &Iend));
  /* the current GPU assembly code does not support off-process value insertion */
  PetscCall(assemble_on_cpu(A, Istart, Iend, N, rank));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

  // check the matrix assembled on the CPU
  PetscCall(VecSet(x, 1.0));
  PetscCall(MatMult(A, x, y));
  PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));

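  // With -testmpiseq on a single rank, round-trip the matrix through MATSEQAIJ and back to
  // MATMPIAIJCUSPARSE to exercise that conversion path before the device assembly.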
  if (testmpiseq && size == 1) {
    PetscCall(MatConvert(A, MATSEQAIJ, MAT_INPLACE_MATRIX, &A));
    PetscCall(MatConvert(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  }
  PetscCall(PetscLogEventBegin(event, 0, 0, 0, 0));
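  // Get a device-side CSR mirror of A; values written through it go directly into the GPU copy of the matrix.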
  PetscCall(MatCUSPARSEGetDeviceMatWrite(A, &d_mat));
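  // Launch a single block of num_threads threads that insert the matrix values on the GPU.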
  assemble_on_gpu<<<1, num_threads>>>(d_mat, Istart, Iend, N, rank);
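  // Make sure the kernel has finished before the host-side assembly calls.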
  PetscCallCUDA(cudaDeviceSynchronize());
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(PetscLogEventEnd(event, 0, 0, 0, 0));

  // check the matrix assembled on the GPU
  PetscCall(VecSet(x, 1.0));
  PetscCall(MatMult(A, x, y));
  PetscCall(VecViewFromOptions(y, NULL, "-ex5_vec_view"));

  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(PetscFinalize());
  return 0;
}

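// The test harness stanza below builds this example only when CUDA is available (and not with clang CUDA)
// and runs it on one and two MPI ranks.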
/*TEST

   build:
     requires: cuda !defined(PETSC_HAVE_CUDA_CLANG)

   test:
     suffix: 0
     diff_args: -j
     args: -n 11 -ex5_vec_view
     nsize: 1

   test:
     suffix: 1
     diff_args: -j
     args: -n 11 -ex5_vec_view
     nsize: 2

   test:
     suffix: 2
     diff_args: -j
     args: -n 11 -testmpiseq -ex5_vec_view
     nsize: 1

TEST*/