Actual source code: sfcuda.cu
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
4: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
5: {
6: PetscInt i, j, k, m, n, r;
7: const PetscInt *offset, *start, *dx, *dy, *X, *Y;
9: n = opt[0];
10: offset = opt + 1;
11: start = opt + n + 2;
12: dx = opt + 2 * n + 2;
13: dy = opt + 3 * n + 2;
14: X = opt + 5 * n + 2;
15: Y = opt + 6 * n + 2;
16: for (r = 0; r < n; r++) {
17: if (tid < offset[r + 1]) break;
18: }
19: m = (tid - offset[r]);
20: k = m / (dx[r] * dy[r]);
21: j = (m - k * dx[r] * dy[r]) / dx[r];
22: i = m - k * dx[r] * dy[r] - j * dx[r];
24: return (start[r] + k * X[r] * Y[r] + j * X[r] + i);
25: }
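/* Editorial worked example (not part of the original source), plugging hypothetical numbers into the code above:
   take n = 1 subdomain with offset = {0, 24}, start[0] = 5, dx[0] = 2, dy[0] = 3, X[0] = 10, Y[0] = 10.
   For tid = 7: r = 0, m = 7, k = 7/(2*3) = 1, j = (7-6)/2 = 0, i = 7-6-0 = 1, so the returned index is
   5 + 1*10*10 + 0*10 + 1 = 106, i.e., the thread's (i,j,k) position inside the 2-by-3 x-y slab is translated
   into the enclosing X-by-Y domain beginning at start[0]. */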
27: /*====================================================================================*/
28: /* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic */
29: /*====================================================================================*/
31: /* Suppose the user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
32: <Type> is PetscReal, which is the primitive type we operate on.
33: <bs> is 16, which says <unit> contains 16 primitive types.
34: <BS> is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>.
35: <EQ> is 0, which is (bs == BS ? 1 : 0)
37: If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
38: For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
39: */
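/* Editorial sketch (not from the original source): with these parameters, a pack kernel for the 16-PetscReal
   unit would be instantiated and launched roughly as

     d_Pack<PetscReal, 8, 0><<<nblocks, nthreads, 0, stream>>>(16, count, start, opt, idx, data, buf);

   (nblocks, nthreads, stream, etc. are placeholders here). Inside the kernel M = bs/BS = 2 and MBS = 16 are
   run-time values, whereas for the 8-PetscReal unit d_Pack<PetscReal, 8, 1> makes M = 1 and MBS = 8 compile-time
   constants, and for bs = BS = 1 the inner loops are fully unrolled away. */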
40: template <class Type, PetscInt BS, PetscInt EQ>
41: __global__ static void d_Pack(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, const Type *data, Type *buf)
42: {
43: PetscInt i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
44: const PetscInt grid_size = gridDim.x * blockDim.x;
45: const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */
46: const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
48: for (; tid < count; tid += grid_size) {
49: /* opt != NULL ==> idx == NULL, i.e., the indices have a pattern but are not contiguous;
50: opt == NULL && idx == NULL ==> the indices are contiguous;
51: */
52: t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
53: s = tid * MBS;
54: for (i = 0; i < MBS; i++) buf[s + i] = data[t + i];
55: }
56: }
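/* Editorial example (not part of the original source) of the three addressing modes above, with MBS = 2:
   - idx = {3, 7, 9}:        the thread with tid = 1 copies data[14..15] into buf[2..3]  (t = 7*2, s = 1*2);
   - opt != NULL:            t comes from MapTidToIndex(opt, tid) instead of an index array;
   - opt == idx == NULL:     with start = 100, tid = 1 copies data[202..203] into buf[2..3]  (t = 101*2). */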
58: template <class Type, class Op, PetscInt BS, PetscInt EQ>
59: __global__ static void d_UnpackAndOp(PetscInt bs, PetscInt count, PetscInt start, const PetscInt *opt, const PetscInt *idx, Type *data, const Type *buf)
60: {
61: PetscInt i, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
62: const PetscInt grid_size = gridDim.x * blockDim.x;
63: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
64: Op op;
66: for (; tid < count; tid += grid_size) {
67: t = (opt ? MapTidToIndex(opt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
68: s = tid * MBS;
69: for (i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
70: }
71: }
73: template <class Type, class Op, PetscInt BS, PetscInt EQ>
74: __global__ static void d_FetchAndOp(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, Type *leafbuf)
75: {
76: PetscInt i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
77: const PetscInt grid_size = gridDim.x * blockDim.x;
78: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
79: Op op;
81: for (; tid < count; tid += grid_size) {
82: r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
83: l = tid * MBS;
84: for (i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
85: }
86: }
88: template <class Type, class Op, PetscInt BS, PetscInt EQ>
89: __global__ static void d_ScatterAndOp(PetscInt bs, PetscInt count, PetscInt srcx, PetscInt srcy, PetscInt srcX, PetscInt srcY, PetscInt srcStart, const PetscInt *srcIdx, const Type *src, PetscInt dstx, PetscInt dsty, PetscInt dstX, PetscInt dstY, PetscInt dstStart, const PetscInt *dstIdx, Type *dst)
90: {
91: PetscInt i, j, k, s, t, tid = blockIdx.x * blockDim.x + threadIdx.x;
92: const PetscInt grid_size = gridDim.x * blockDim.x;
93: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
94: Op op;
96: for (; tid < count; tid += grid_size) {
97: if (!srcIdx) { /* src is either contiguous or 3D */
98: k = tid / (srcx * srcy);
99: j = (tid - k * srcx * srcy) / srcx;
100: i = tid - k * srcx * srcy - j * srcx;
101: s = srcStart + k * srcX * srcY + j * srcX + i;
102: } else {
103: s = srcIdx[tid];
104: }
106: if (!dstIdx) { /* dst is either contiguous or 3D */
107: k = tid / (dstx * dsty);
108: j = (tid - k * dstx * dsty) / dstx;
109: i = tid - k * dstx * dsty - j * dstx;
110: t = dstStart + k * dstX * dstY + j * dstX + i;
111: } else {
112: t = dstIdx[tid];
113: }
115: s *= MBS;
116: t *= MBS;
117: for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
118: }
119: }
121: template <class Type, class Op, PetscInt BS, PetscInt EQ>
122: __global__ static void d_FetchAndOpLocal(PetscInt bs, PetscInt count, PetscInt rootstart, const PetscInt *rootopt, const PetscInt *rootidx, Type *rootdata, PetscInt leafstart, const PetscInt *leafopt, const PetscInt *leafidx, const Type *leafdata, Type *leafupdate)
123: {
124: PetscInt i, l, r, tid = blockIdx.x * blockDim.x + threadIdx.x;
125: const PetscInt grid_size = gridDim.x * blockDim.x;
126: const PetscInt M = (EQ) ? 1 : bs / BS, MBS = M * BS;
127: Op op;
129: for (; tid < count; tid += grid_size) {
130: r = (rootopt ? MapTidToIndex(rootopt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
131: l = (leafopt ? MapTidToIndex(leafopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
132: for (i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
133: }
134: }
136: /*====================================================================================*/
137: /* Regular operations on device */
138: /*====================================================================================*/
139: template <typename Type>
140: struct Insert {
141: __device__ Type operator()(Type &x, Type y) const
142: {
143: Type old = x;
144: x = y;
145: return old;
146: }
147: };
148: template <typename Type>
149: struct Add {
150: __device__ Type operator()(Type &x, Type y) const
151: {
152: Type old = x;
153: x += y;
154: return old;
155: }
156: };
157: template <typename Type>
158: struct Mult {
159: __device__ Type operator()(Type &x, Type y) const
160: {
161: Type old = x;
162: x *= y;
163: return old;
164: }
165: };
166: template <typename Type>
167: struct Min {
168: __device__ Type operator()(Type &x, Type y) const
169: {
170: Type old = x;
171: x = PetscMin(x, y);
172: return old;
173: }
174: };
175: template <typename Type>
176: struct Max {
177: __device__ Type operator()(Type &x, Type y) const
178: {
179: Type old = x;
180: x = PetscMax(x, y);
181: return old;
182: }
183: };
184: template <typename Type>
185: struct LAND {
186: __device__ Type operator()(Type &x, Type y) const
187: {
188: Type old = x;
189: x = x && y;
190: return old;
191: }
192: };
193: template <typename Type>
194: struct LOR {
195: __device__ Type operator()(Type &x, Type y) const
196: {
197: Type old = x;
198: x = x || y;
199: return old;
200: }
201: };
202: template <typename Type>
203: struct LXOR {
204: __device__ Type operator()(Type &x, Type y) const
205: {
206: Type old = x;
207: x = !x != !y;
208: return old;
209: }
210: };
211: template <typename Type>
212: struct BAND {
213: __device__ Type operator()(Type &x, Type y) const
214: {
215: Type old = x;
216: x = x & y;
217: return old;
218: }
219: };
220: template <typename Type>
221: struct BOR {
222: __device__ Type operator()(Type &x, Type y) const
223: {
224: Type old = x;
225: x = x | y;
226: return old;
227: }
228: };
229: template <typename Type>
230: struct BXOR {
231: __device__ Type operator()(Type &x, Type y) const
232: {
233: Type old = x;
234: x = x ^ y;
235: return old;
236: }
237: };
238: template <typename Type>
239: struct Minloc {
240: __device__ Type operator()(Type &x, Type y) const
241: {
242: Type old = x;
243: if (y.a < x.a) x = y;
244: else if (y.a == x.a) x.b = min(x.b, y.b);
245: return old;
246: }
247: };
248: template <typename Type>
249: struct Maxloc {
250: __device__ Type operator()(Type &x, Type y) const
251: {
252: Type old = x;
253: if (y.a > x.a) x = y;
254: else if (y.a == x.a) x.b = min(x.b, y.b); /* See MPI MAXLOC */
255: return old;
256: }
257: };
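/* Editorial note (not part of the original source): each functor above updates x in place and returns the old
   value of x. For example, with Add<double> op and double x = 1.0, calling old = op(x, 2.0) leaves x == 3.0 and
   old == 1.0. d_UnpackAndOp and d_ScatterAndOp discard the return value; d_FetchAndOp and d_FetchAndOpLocal
   store it into leafbuf/leafupdate, which is what gives the fetch-and-op semantics. */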
259: /*====================================================================================*/
260: /* Atomic operations on device */
261: /*====================================================================================*/
263: /*
264: Atomic Insert (exchange) operations
266: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
268: int atomicExch(int* address, int val);
269: unsigned int atomicExch(unsigned int* address, unsigned int val);
270: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
271: float atomicExch(float* address, float val);
273: reads the 32-bit or 64-bit word old located at the address address in global or shared
274: memory and stores val back to memory at the same address. These two operations are
275: performed in one atomic transaction. The function returns old.
277: PETSc notes:
279: It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.
281: VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
282: atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
283: be atomic itself.
285: With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
286: insertion is atomic. Hopefully no user code relies on that.
287: */
288: __device__ static double atomicExch(double *address, double val)
289: {
290: return __longlong_as_double(atomicExch((ullint *)address, __double_as_longlong(val)));
291: }
293: __device__ static llint atomicExch(llint *address, llint val)
294: {
295: return (llint)(atomicExch((ullint *)address, (ullint)val));
296: }
298: template <typename Type>
299: struct AtomicInsert {
300: __device__ Type operator()(Type &x, Type y) const { return atomicExch(&x, y); }
301: };
303: #if defined(PETSC_HAVE_COMPLEX)
304: #if defined(PETSC_USE_REAL_DOUBLE)
305: /* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values into the same location */
306: template <>
307: struct AtomicInsert<PetscComplex> {
308: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
309: {
310: PetscComplex old, *z = &old;
311: double *xp = (double *)&x, *yp = (double *)&y;
312: AtomicInsert<double> op;
313: z[0] = op(xp[0], yp[0]);
314: z[1] = op(xp[1], yp[1]);
315: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
316: }
317: };
318: #elif defined(PETSC_USE_REAL_SINGLE)
319: template <>
320: struct AtomicInsert<PetscComplex> {
321: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
322: {
323: double *xp = (double *)&x, *yp = (double *)&y;
324: AtomicInsert<double> op;
325: return op(xp[0], yp[0]);
326: }
327: };
328: #endif
329: #endif
331: /*
332: Atomic add operations
334: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
336: int atomicAdd(int* address, int val);
337: unsigned int atomicAdd(unsigned int* address,unsigned int val);
338: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
339: float atomicAdd(float* address, float val);
340: double atomicAdd(double* address, double val);
341: __half2 atomicAdd(__half2 *address, __half2 val);
342: __half atomicAdd(__half *address, __half val);
344: reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
345: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
346: function returns old.
348: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
349: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
350: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
351: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
352: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
353: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
354: */
355: __device__ static llint atomicAdd(llint *address, llint val)
356: {
357: return (llint)atomicAdd((ullint *)address, (ullint)val);
358: }
360: template <typename Type>
361: struct AtomicAdd {
362: __device__ Type operator()(Type &x, Type y) const { return atomicAdd(&x, y); }
363: };
365: template <>
366: struct AtomicAdd<double> {
367: __device__ double operator()(double &x, double y) const
368: {
369: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
370: return atomicAdd(&x, y);
371: #else
372: double *address = &x, val = y;
373: ullint *address_as_ull = (ullint *)address;
374: ullint old = *address_as_ull, assumed;
375: do {
376: assumed = old;
377: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
378: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
379: } while (assumed != old);
380: return __longlong_as_double(old);
381: #endif
382: }
383: };
385: template <>
386: struct AtomicAdd<float> {
387: __device__ float operator()(float &x, float y) const
388: {
389: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
390: return atomicAdd(&x, y);
391: #else
392: float *address = &x, val = y;
393: int *address_as_int = (int *)address;
394: int old = *address_as_int, assumed;
395: do {
396: assumed = old;
397: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
398: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) */
399: } while (assumed != old);
400: return __int_as_float(old);
401: #endif
402: }
403: };
405: #if defined(PETSC_HAVE_COMPLEX)
406: template <>
407: struct AtomicAdd<PetscComplex> {
408: __device__ PetscComplex operator()(PetscComplex &x, PetscComplex y) const
409: {
410: PetscComplex old, *z = &old;
411: PetscReal *xp = (PetscReal *)&x, *yp = (PetscReal *)&y;
412: AtomicAdd<PetscReal> op;
413: z[0] = op(xp[0], yp[0]);
414: z[1] = op(xp[1], yp[1]);
415: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
416: }
417: };
418: #endif
420: /*
421: Atomic Mult operations:
423: CUDA has no atomicMult at all, so we build our own with atomicCAS
424: */
425: #if defined(PETSC_USE_REAL_DOUBLE)
426: __device__ static double atomicMult(double *address, double val)
427: {
428: ullint *address_as_ull = (ullint *)(address);
429: ullint old = *address_as_ull, assumed;
430: do {
431: assumed = old;
432: /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
433: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * __longlong_as_double(assumed)));
434: } while (assumed != old);
435: return __longlong_as_double(old);
436: }
437: #elif defined(PETSC_USE_REAL_SINGLE)
438: __device__ static float atomicMult(float *address, float val)
439: {
440: int *address_as_int = (int *)(address);
441: int old = *address_as_int, assumed;
442: do {
443: assumed = old;
444: old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
445: } while (assumed != old);
446: return __int_as_float(old);
447: }
448: #endif
450: __device__ static int atomicMult(int *address, int val)
451: {
452: int *address_as_int = (int *)(address);
453: int old = *address_as_int, assumed;
454: do {
455: assumed = old;
456: old = atomicCAS(address_as_int, assumed, val * assumed);
457: } while (assumed != old);
458: return (int)old;
459: }
461: __device__ static llint atomicMult(llint *address, llint val)
462: {
463: ullint *address_as_ull = (ullint *)(address);
464: ullint old = *address_as_ull, assumed;
465: do {
466: assumed = old;
467: old = atomicCAS(address_as_ull, assumed, (ullint)(val * (llint)assumed));
468: } while (assumed != old);
469: return (llint)old;
470: }
472: template <typename Type>
473: struct AtomicMult {
474: __device__ Type operator()(Type &x, Type y) const { return atomicMult(&x, y); }
475: };
477: /*
478: Atomic Min/Max operations
480: CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
482: int atomicMin(int* address, int val);
483: unsigned int atomicMin(unsigned int* address,unsigned int val);
484: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
486: reads the 32-bit or 64-bit word old located at the address address in global or shared
487: memory, computes the minimum of old and val, and stores the result back to memory
488: at the same address. These three operations are performed in one atomic transaction.
489: The function returns old.
490: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
492: atomicMax() is similar.
493: */
495: #if defined(PETSC_USE_REAL_DOUBLE)
496: __device__ static double atomicMin(double *address, double val)
497: {
498: ullint *address_as_ull = (ullint *)(address);
499: ullint old = *address_as_ull, assumed;
500: do {
501: assumed = old;
502: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val, __longlong_as_double(assumed))));
503: } while (assumed != old);
504: return __longlong_as_double(old);
505: }
507: __device__ static double atomicMax(double *address, double val)
508: {
509: ullint *address_as_ull = (ullint *)(address);
510: ullint old = *address_as_ull, assumed;
511: do {
512: assumed = old;
513: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val, __longlong_as_double(assumed))));
514: } while (assumed != old);
515: return __longlong_as_double(old);
516: }
517: #elif defined(PETSC_USE_REAL_SINGLE)
518: __device__ static float atomicMin(float *address, float val)
519: {
520: int *address_as_int = (int *)(address);
521: int old = *address_as_int, assumed;
522: do {
523: assumed = old;
524: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val, __int_as_float(assumed))));
525: } while (assumed != old);
526: return __int_as_float(old);
527: }
529: __device__ static float atomicMax(float *address, float val)
530: {
531: int *address_as_int = (int *)(address);
532: int old = *address_as_int, assumed;
533: do {
534: assumed = old;
535: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val, __int_as_float(assumed))));
536: } while (assumed != old);
537: return __int_as_float(old);
538: }
539: #endif
541: /*
542: atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
543: atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
544: This causes compilation errors with PGI compilers and 64-bit indices:
545: error: function "atomicMin(long long *, long long)" has already been defined
547: So we add extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
548: */
549: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
550: __device__ static llint atomicMin(llint *address, llint val)
551: {
552: ullint *address_as_ull = (ullint *)(address);
553: ullint old = *address_as_ull, assumed;
554: do {
555: assumed = old;
556: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val, (llint)assumed)));
557: } while (assumed != old);
558: return (llint)old;
559: }
561: __device__ static llint atomicMax(llint *address, llint val)
562: {
563: ullint *address_as_ull = (ullint *)(address);
564: ullint old = *address_as_ull, assumed;
565: do {
566: assumed = old;
567: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val, (llint)assumed)));
568: } while (assumed != old);
569: return (llint)old;
570: }
571: #endif
573: template <typename Type>
574: struct AtomicMin {
575: __device__ Type operator()(Type &x, Type y) const { return atomicMin(&x, y); }
576: };
577: template <typename Type>
578: struct AtomicMax {
579: __device__ Type operator()(Type &x, Type y) const { return atomicMax(&x, y); }
580: };
582: /*
583: Atomic bitwise operations
585: CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
587: int atomicAnd(int* address, int val);
588: unsigned int atomicAnd(unsigned int* address,unsigned int val);
589: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
591: reads the 32-bit or 64-bit word old located at the address address in global or shared
592: memory, computes (old & val), and stores the result back to memory at the same
593: address. These three operations are performed in one atomic transaction.
594: The function returns old.
596: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
598: atomicOr() and atomicXor() are similar.
599: */
601: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
602: __device__ static llint atomicAnd(llint *address, llint val)
603: {
604: ullint *address_as_ull = (ullint *)(address);
605: ullint old = *address_as_ull, assumed;
606: do {
607: assumed = old;
608: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
609: } while (assumed != old);
610: return (llint)old;
611: }
612: __device__ static llint atomicOr(llint *address, llint val)
613: {
614: ullint *address_as_ull = (ullint *)(address);
615: ullint old = *address_as_ull, assumed;
616: do {
617: assumed = old;
618: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
619: } while (assumed != old);
620: return (llint)old;
621: }
623: __device__ static llint atomicXor(llint *address, llint val)
624: {
625: ullint *address_as_ull = (ullint *)(address);
626: ullint old = *address_as_ull, assumed;
627: do {
628: assumed = old;
629: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
630: } while (assumed != old);
631: return (llint)old;
632: }
633: #endif
635: template <typename Type>
636: struct AtomicBAND {
637: __device__ Type operator()(Type &x, Type y) const { return atomicAnd(&x, y); }
638: };
639: template <typename Type>
640: struct AtomicBOR {
641: __device__ Type operator()(Type &x, Type y) const { return atomicOr(&x, y); }
642: };
643: template <typename Type>
644: struct AtomicBXOR {
645: __device__ Type operator()(Type &x, Type y) const { return atomicXor(&x, y); }
646: };
648: /*
649: Atomic logical operations:
651: CUDA has no atomic logical operations at all. We support them on integer types.
652: */
654: /* A template without a definition makes any instantiation that does not use the given specializations erroneous at compile time,
655: which is what we want since we only support 32-bit and 64-bit integers.
656: */
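/* Editorial example (not part of the original source): AtomicLogical<int, land<int>, 4> and
   AtomicLogical<llint, land<llint>, 8> match the specializations below, whereas something like

     AtomicLogical<short, land<short>, 2> op;   // error: incomplete type

   is rejected at compile time because no 2-byte specialization is defined. */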
657: template <typename Type, class Op, int size /* sizeof(Type) */>
658: struct AtomicLogical;
660: template <typename Type, class Op>
661: struct AtomicLogical<Type, Op, 4> {
662: __device__ Type operator()(Type &x, Type y) const
663: {
664: int *address_as_int = (int *)(&x);
665: int old = *address_as_int, assumed;
666: Op op;
667: do {
668: assumed = old;
669: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed, y)));
670: } while (assumed != old);
671: return (Type)old;
672: }
673: };
675: template <typename Type, class Op>
676: struct AtomicLogical<Type, Op, 8> {
677: __device__ Type operator()(Type &x, Type y) const
678: {
679: ullint *address_as_ull = (ullint *)(&x);
680: ullint old = *address_as_ull, assumed;
681: Op op;
682: do {
683: assumed = old;
684: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed, y)));
685: } while (assumed != old);
686: return (Type)old;
687: }
688: };
690: /* Note land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
691: template <typename Type>
692: struct land {
693: __device__ Type operator()(Type x, Type y) { return x && y; }
694: };
695: template <typename Type>
696: struct lor {
697: __device__ Type operator()(Type x, Type y) { return x || y; }
698: };
699: template <typename Type>
700: struct lxor {
701: __device__ Type operator()(Type x, Type y) { return (!x != !y); }
702: };
704: template <typename Type>
705: struct AtomicLAND {
706: __device__ Type operator()(Type &x, Type y) const
707: {
708: AtomicLogical<Type, land<Type>, sizeof(Type)> op;
709: return op(x, y);
710: }
711: };
712: template <typename Type>
713: struct AtomicLOR {
714: __device__ Type operator()(Type &x, Type y) const
715: {
716: AtomicLogical<Type, lor<Type>, sizeof(Type)> op;
717: return op(x, y);
718: }
719: };
720: template <typename Type>
721: struct AtomicLXOR {
722: __device__ Type operator()(Type &x, Type y) const
723: {
724: AtomicLogical<Type, lxor<Type>, sizeof(Type)> op;
725: return op(x, y);
726: }
727: };
729: /*====================================================================================*/
730: /* Wrapper functions of CUDA kernels. Function pointers are stored in 'link' */
731: /*====================================================================================*/
732: template <typename Type, PetscInt BS, PetscInt EQ>
733: static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data, void *buf)
734: {
735: PetscInt nthreads = 256;
736: PetscInt nblocks = (count + nthreads - 1) / nthreads;
737: const PetscInt *iarray = opt ? opt->array : NULL;
739: PetscFunctionBegin;
740: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
741: if (!opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
742: PetscCallCUDA(cudaMemcpyAsync(buf, (char *)data + start * link->unitbytes, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream));
743: } else {
744: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
745: d_Pack<Type, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (const Type *)data, (Type *)buf);
746: PetscCallCUDA(cudaGetLastError());
747: }
748: PetscFunctionReturn(PETSC_SUCCESS);
749: }
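/* Editorial worked example (not part of the original source), with assumed numbers: for count = 1,000,000 and
   nthreads = 256, the ceiling division above gives 3907 blocks; if link->maxResidentThreadsPerGPU were 163840
   (say, 80 SMs x 2048 resident threads, an assumption), nblocks is clamped to 163840/256 = 640, and the
   grid-stride loop inside d_Pack lets those 640*256 threads cover all 1,000,000 elements. */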
751: /* Specialization of UnpackAndOp for the cudaMemcpyAsync() below. Usually, if this is a contiguous memcpy, we use root/leafdirect and do
752: not need UnpackAndOp. Only with nvshmem do we need this 'nvshmem buf to CUDA data' memory copy
753: */
754: template <typename Type, PetscInt BS, PetscInt EQ>
755: static PetscErrorCode Unpack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
756: {
757: PetscInt nthreads = 256;
758: PetscInt nblocks = (count + nthreads - 1) / nthreads;
759: const PetscInt *iarray = opt ? opt->array : NULL;
761: PetscFunctionBegin;
762: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
763: if (!opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
764: PetscCallCUDA(cudaMemcpyAsync((char *)data + start * link->unitbytes, buf, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream));
765: } else {
766: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
767: d_UnpackAndOp<Type, Insert<Type>, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
768: PetscCallCUDA(cudaGetLastError());
769: }
770: PetscFunctionReturn(PETSC_SUCCESS);
771: }
773: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
774: static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, const void *buf)
775: {
776: PetscInt nthreads = 256;
777: PetscInt nblocks = (count + nthreads - 1) / nthreads;
778: const PetscInt *iarray = opt ? opt->array : NULL;
780: PetscFunctionBegin;
781: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
782: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
783: d_UnpackAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (const Type *)buf);
784: PetscCallCUDA(cudaGetLastError());
785: PetscFunctionReturn(PETSC_SUCCESS);
786: }
788: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
789: static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
790: {
791: PetscInt nthreads = 256;
792: PetscInt nblocks = (count + nthreads - 1) / nthreads;
793: const PetscInt *iarray = opt ? opt->array : NULL;
795: PetscFunctionBegin;
796: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
797: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
798: d_FetchAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, start, iarray, idx, (Type *)data, (Type *)buf);
799: PetscCallCUDA(cudaGetLastError());
800: PetscFunctionReturn(PETSC_SUCCESS);
801: }
803: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
804: static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
805: {
806: PetscInt nthreads = 256;
807: PetscInt nblocks = (count + nthreads - 1) / nthreads;
808: PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;
810: PetscFunctionBegin;
811: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
812: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
814: /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
815: if (srcOpt) {
816: srcx = srcOpt->dx[0];
817: srcy = srcOpt->dy[0];
818: srcX = srcOpt->X[0];
819: srcY = srcOpt->Y[0];
820: srcStart = srcOpt->start[0];
821: srcIdx = NULL;
822: } else if (!srcIdx) {
823: srcx = srcX = count;
824: srcy = srcY = 1;
825: }
827: if (dstOpt) {
828: dstx = dstOpt->dx[0];
829: dsty = dstOpt->dy[0];
830: dstX = dstOpt->X[0];
831: dstY = dstOpt->Y[0];
832: dstStart = dstOpt->start[0];
833: dstIdx = NULL;
834: } else if (!dstIdx) {
835: dstx = dstX = count;
836: dsty = dstY = 1;
837: }
839: d_ScatterAndOp<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, srcx, srcy, srcX, srcY, srcStart, srcIdx, (const Type *)src, dstx, dsty, dstX, dstY, dstStart, dstIdx, (Type *)dst);
840: PetscCallCUDA(cudaGetLastError());
841: PetscFunctionReturn(PETSC_SUCCESS);
842: }
844: /* Specialization for Insert since we may use cudaMemcpyAsync */
845: template <typename Type, PetscInt BS, PetscInt EQ>
846: static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst)
847: {
848: PetscFunctionBegin;
849: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
850: /* src and dst are contiguous */
851: if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
852: PetscCallCUDA(cudaMemcpyAsync((Type *)dst + dstStart * link->bs, (const Type *)src + srcStart * link->bs, count * link->unitbytes, cudaMemcpyDeviceToDevice, link->stream));
853: } else {
854: PetscCall(ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst));
855: }
856: PetscFunctionReturn(PETSC_SUCCESS);
857: }
859: template <typename Type, class Op, PetscInt BS, PetscInt EQ>
860: static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate)
861: {
862: PetscInt nthreads = 256;
863: PetscInt nblocks = (count + nthreads - 1) / nthreads;
864: const PetscInt *rarray = rootopt ? rootopt->array : NULL;
865: const PetscInt *larray = leafopt ? leafopt->array : NULL;
867: PetscFunctionBegin;
868: if (!count) PetscFunctionReturn(PETSC_SUCCESS);
869: nblocks = PetscMin(nblocks, link->maxResidentThreadsPerGPU / nthreads);
870: d_FetchAndOpLocal<Type, Op, BS, EQ><<<nblocks, nthreads, 0, link->stream>>>(link->bs, count, rootstart, rarray, rootidx, (Type *)rootdata, leafstart, larray, leafidx, (const Type *)leafdata, (Type *)leafupdate);
871: PetscCallCUDA(cudaGetLastError());
872: PetscFunctionReturn(PETSC_SUCCESS);
873: }
875: /*====================================================================================*/
876: /* Init various types and instantiate pack/unpack function pointers */
877: /*====================================================================================*/
878: template <typename Type, PetscInt BS, PetscInt EQ>
879: static void PackInit_RealType(PetscSFLink link)
880: {
881: /* Pack/unpack for remote communication */
882: link->d_Pack = Pack<Type, BS, EQ>;
883: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
884: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
885: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
886: link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>;
887: link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>;
888: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
890: /* Scatter for local communication */
891: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
892: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
893: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
894: link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>;
895: link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>;
896: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
898: /* Atomic versions when there are data-race possibilities */
899: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
900: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
901: link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
902: link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
903: link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
904: link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
906: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
907: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
908: link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
909: link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
910: link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
911: link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
912: }
914: /* We use this templated class so that we can specialize it for char-sized integers */
915: template <typename Type, PetscInt BS, PetscInt EQ, PetscInt size /*sizeof(Type)*/>
916: struct PackInit_IntegerType_Atomic {
917: static void Init(PetscSFLink link)
918: {
919: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
920: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
921: link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
922: link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
923: link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
924: link->da_UnpackAndLAND = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
925: link->da_UnpackAndLOR = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
926: link->da_UnpackAndLXOR = UnpackAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
927: link->da_UnpackAndBAND = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
928: link->da_UnpackAndBOR = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
929: link->da_UnpackAndBXOR = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
930: link->da_FetchAndAdd = FetchAndOp<Type, AtomicAdd<Type>, BS, EQ>;
932: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
933: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
934: link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
935: link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
936: link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
937: link->da_ScatterAndLAND = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
938: link->da_ScatterAndLOR = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
939: link->da_ScatterAndLXOR = ScatterAndOp<Type, AtomicLXOR<Type>, BS, EQ>;
940: link->da_ScatterAndBAND = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
941: link->da_ScatterAndBOR = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
942: link->da_ScatterAndBXOR = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
943: link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicAdd<Type>, BS, EQ>;
944: }
945: };
947: /* CUDA does not support atomics on chars; support for them in PETSc is TBD. */
948: template <typename Type, PetscInt BS, PetscInt EQ>
949: struct PackInit_IntegerType_Atomic<Type, BS, EQ, 1> {
950: static void Init(PetscSFLink)
951: { /* Nothing to do; leave the function pointers NULL */
952: }
953: };
955: template <typename Type, PetscInt BS, PetscInt EQ>
956: static void PackInit_IntegerType(PetscSFLink link)
957: {
958: link->d_Pack = Pack<Type, BS, EQ>;
959: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
960: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
961: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
962: link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>;
963: link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>;
964: link->d_UnpackAndLAND = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
965: link->d_UnpackAndLOR = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
966: link->d_UnpackAndLXOR = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
967: link->d_UnpackAndBAND = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
968: link->d_UnpackAndBOR = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
969: link->d_UnpackAndBXOR = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
970: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
972: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
973: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
974: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
975: link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>;
976: link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>;
977: link->d_ScatterAndLAND = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
978: link->d_ScatterAndLOR = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
979: link->d_ScatterAndLXOR = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
980: link->d_ScatterAndBAND = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
981: link->d_ScatterAndBOR = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
982: link->d_ScatterAndBXOR = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
983: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
984: PackInit_IntegerType_Atomic<Type, BS, EQ, sizeof(Type)>::Init(link);
985: }
987: #if defined(PETSC_HAVE_COMPLEX)
988: template <typename Type, PetscInt BS, PetscInt EQ>
989: static void PackInit_ComplexType(PetscSFLink link)
990: {
991: link->d_Pack = Pack<Type, BS, EQ>;
992: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
993: link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>;
994: link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
995: link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>;
997: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
998: link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>;
999: link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
1000: link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
1002: link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
1003: link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
1004: link->da_UnpackAndMult = NULL; /* Not implemented yet */
1005: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
1007: link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
1008: link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
1009: }
1010: #endif
1012: typedef signed char SignedChar;
1013: typedef unsigned char UnsignedChar;
1014: typedef struct {
1015: int a;
1016: int b;
1017: } PairInt;
1018: typedef struct {
1019: PetscInt a;
1020: PetscInt b;
1021: } PairPetscInt;
1023: template <typename Type>
1024: static void PackInit_PairType(PetscSFLink link)
1025: {
1026: link->d_Pack = Pack<Type, 1, 1>;
1027: link->d_UnpackAndInsert = Unpack<Type, 1, 1>;
1028: link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
1029: link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;
1031: link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
1032: link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
1033: link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
1034: /* Atomics for pair types are not implemented yet */
1035: }
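/* Editorial example (not part of the original source): PairInt matches the layout of MPI_2INT (see the is2Int
   branch in PetscSFLinkSetUp_CUDA below). With Maxloc<PairInt> and x = {5, 3}, y = {5, 1}, the values tie on .a,
   so x.b becomes min(3, 1) = 1, reproducing MPI_MAXLOC's rule of keeping the smaller index on ties; with
   y = {7, 9}, x simply becomes {7, 9}. */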
1037: template <typename Type, PetscInt BS, PetscInt EQ>
1038: static void PackInit_DumbType(PetscSFLink link)
1039: {
1040: link->d_Pack = Pack<Type, BS, EQ>;
1041: link->d_UnpackAndInsert = Unpack<Type, BS, EQ>;
1042: link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
1043: /* Atomics for dumb types are not implemented yet */
1044: }
1046: /* Some device-specific utilities */
1047: static PetscErrorCode PetscSFLinkSyncDevice_CUDA(PetscSFLink)
1048: {
1049: PetscFunctionBegin;
1050: PetscCallCUDA(cudaDeviceSynchronize());
1051: PetscFunctionReturn(PETSC_SUCCESS);
1052: }
1054: static PetscErrorCode PetscSFLinkSyncStream_CUDA(PetscSFLink link)
1055: {
1056: PetscFunctionBegin;
1057: PetscCallCUDA(cudaStreamSynchronize(link->stream));
1058: PetscFunctionReturn(PETSC_SUCCESS);
1059: }
1061: static PetscErrorCode PetscSFLinkMemcpy_CUDA(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
1062: {
1063: PetscFunctionBegin;
1064: enum cudaMemcpyKind kinds[2][2] = {
1065: {cudaMemcpyHostToHost, cudaMemcpyHostToDevice },
1066: {cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice}
1067: };
1069: if (n) {
1070: if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Handle HostToHost separately so that pure-CPU code won't call the CUDA runtime */
1071: PetscCall(PetscMemcpy(dst, src, n));
1072: } else {
1073: int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
1074: int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
1075: PetscCallCUDA(cudaMemcpyAsync(dst, src, n, kinds[stype][dtype], link->stream));
1076: }
1077: }
1078: PetscFunctionReturn(PETSC_SUCCESS);
1079: }
1081: PetscErrorCode PetscSFMalloc_CUDA(PetscMemType mtype, size_t size, void **ptr)
1082: {
1083: PetscFunctionBegin;
1084: if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size, ptr));
1085: else if (PetscMemTypeDevice(mtype)) {
1086: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1087: PetscCallCUDA(cudaMalloc(ptr, size));
1088: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1089: PetscFunctionReturn(PETSC_SUCCESS);
1090: }
1092: PetscErrorCode PetscSFFree_CUDA(PetscMemType mtype, void *ptr)
1093: {
1094: PetscFunctionBegin;
1095: if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr));
1096: else if (PetscMemTypeDevice(mtype)) PetscCallCUDA(cudaFree(ptr));
1097: else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
1098: PetscFunctionReturn(PETSC_SUCCESS);
1099: }
1101: /* Destructor when the link uses MPI for communication on CUDA device */
1102: static PetscErrorCode PetscSFLinkDestroy_MPI_CUDA(PetscSF, PetscSFLink link)
1103: {
1104: PetscFunctionBegin;
1105: for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1106: PetscCallCUDA(cudaFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
1107: PetscCallCUDA(cudaFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
1108: }
1109: PetscFunctionReturn(PETSC_SUCCESS);
1110: }
1112: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
1113: PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
1114: {
1115: PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
1116: PetscBool is2Int, is2PetscInt;
1117: #if defined(PETSC_HAVE_COMPLEX)
1118: PetscInt nPetscComplex = 0;
1119: #endif
1121: PetscFunctionBegin;
1122: if (link->deviceinited) PetscFunctionReturn(PETSC_SUCCESS);
1123: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
1124: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
1125: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
1126: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
1127: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
1128: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
1129: #if defined(PETSC_HAVE_COMPLEX)
1130: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
1131: #endif
1132: PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
1133: PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
1135: if (is2Int) {
1136: PackInit_PairType<PairInt>(link);
1137: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
1138: PackInit_PairType<PairPetscInt>(link);
1139: } else if (nPetscReal) {
1140: #if !defined(PETSC_HAVE_DEVICE)
1141: if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
1142: else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
1143: else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
1144: else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
1145: else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
1146: else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
1147: else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
1148: else if (nPetscReal % 1 == 0)
1149: #endif
1150: PackInit_RealType<PetscReal, 1, 0>(link);
1151: } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
1152: #if !defined(PETSC_HAVE_DEVICE)
1153: if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
1154: else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
1155: else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
1156: else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
1157: else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
1158: else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
1159: else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
1160: else if (nPetscInt % 1 == 0)
1161: #endif
1162: PackInit_IntegerType<llint, 1, 0>(link);
1163: } else if (nInt) {
1164: #if !defined(PETSC_HAVE_DEVICE)
1165: if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
1166: else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
1167: else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
1168: else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
1169: else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
1170: else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
1171: else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
1172: else if (nInt % 1 == 0)
1173: #endif
1174: PackInit_IntegerType<int, 1, 0>(link);
1175: } else if (nSignedChar) {
1176: #if !defined(PETSC_HAVE_DEVICE)
1177: if (nSignedChar == 8) PackInit_IntegerType<SignedChar, 8, 1>(link);
1178: else if (nSignedChar % 8 == 0) PackInit_IntegerType<SignedChar, 8, 0>(link);
1179: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar, 4, 1>(link);
1180: else if (nSignedChar % 4 == 0) PackInit_IntegerType<SignedChar, 4, 0>(link);
1181: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar, 2, 1>(link);
1182: else if (nSignedChar % 2 == 0) PackInit_IntegerType<SignedChar, 2, 0>(link);
1183: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar, 1, 1>(link);
1184: else if (nSignedChar % 1 == 0)
1185: #endif
1186: PackInit_IntegerType<SignedChar, 1, 0>(link);
1187: } else if (nUnsignedChar) {
1188: #if !defined(PETSC_HAVE_DEVICE)
1189: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar, 8, 1>(link);
1190: else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<UnsignedChar, 8, 0>(link);
1191: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar, 4, 1>(link);
1192: else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<UnsignedChar, 4, 0>(link);
1193: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar, 2, 1>(link);
1194: else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<UnsignedChar, 2, 0>(link);
1195: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar, 1, 1>(link);
1196: else if (nUnsignedChar % 1 == 0)
1197: #endif
1198: PackInit_IntegerType<UnsignedChar, 1, 0>(link);
1199: #if defined(PETSC_HAVE_COMPLEX)
1200: } else if (nPetscComplex) {
1201: #if !defined(PETSC_HAVE_DEVICE)
1202: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex, 8, 1>(link);
1203: else if (nPetscComplex % 8 == 0) PackInit_ComplexType<PetscComplex, 8, 0>(link);
1204: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex, 4, 1>(link);
1205: else if (nPetscComplex % 4 == 0) PackInit_ComplexType<PetscComplex, 4, 0>(link);
1206: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex, 2, 1>(link);
1207: else if (nPetscComplex % 2 == 0) PackInit_ComplexType<PetscComplex, 2, 0>(link);
1208: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex, 1, 1>(link);
1209: else if (nPetscComplex % 1 == 0)
1210: #endif
1211: PackInit_ComplexType<PetscComplex, 1, 0>(link);
1212: #endif
1213: } else {
1214: MPI_Aint lb, nbyte;
1215: PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
1216: PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
1217: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
1218: #if !defined(PETSC_HAVE_DEVICE)
1219: if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
1220: else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
1221: else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
1222: else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
1223: else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
1224: else if (nbyte % 1 == 0)
1225: #endif
1226: PackInit_DumbType<char, 1, 0>(link);
1227: } else {
1228: nInt = nbyte / sizeof(int);
1229: #if !defined(PETSC_HAVE_DEVICE)
1230: if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
1231: else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
1232: else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
1233: else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
1234: else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
1235: else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
1236: else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
1237: else if (nInt % 1 == 0)
1238: #endif
1239: PackInit_DumbType<int, 1, 0>(link);
1240: }
1241: }
1243: if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
1244: int device;
1245: struct cudaDeviceProp props;
1246: PetscCallCUDA(cudaGetDevice(&device));
1247: PetscCallCUDA(cudaGetDeviceProperties(&props, device));
1248: sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
1249: }
1250: link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
1252: link->stream = PetscDefaultCudaStream;
1253: link->Destroy = PetscSFLinkDestroy_MPI_CUDA;
1254: link->SyncDevice = PetscSFLinkSyncDevice_CUDA;
1255: link->SyncStream = PetscSFLinkSyncStream_CUDA;
1256: link->Memcpy = PetscSFLinkMemcpy_CUDA;
1257: link->deviceinited = PETSC_TRUE;
1258: PetscFunctionReturn(PETSC_SUCCESS);
1259: }
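/* Editorial note (not part of the original source): as an example of the dispatch above, for unit = MPIU_REAL
   (a single PetscReal, so bs = 1) the nPetscReal branch is taken. Assuming PETSC_HAVE_DEVICE is defined when
   this CUDA file is compiled, the guarded special cases are skipped and PackInit_RealType<PetscReal, 1, 0>(link)
   is used, which sets link->d_Pack = Pack<PetscReal, 1, 0>,
   link->d_UnpackAndAdd = UnpackAndOp<PetscReal, Add<PetscReal>, 1, 0>, and so on. */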