Actual source code: sfpack.c
1: #include "petsc/private/sfimpl.h"
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
7: /*
8: * MPI_Reduce_local is not really useful because it cannot handle sparse data and it vectorizes "in the wrong direction";
9: * therefore we pack data types manually. This file defines packing routines for the standard data types.
10: */
12: #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
14: /* Operations working like s += t */
15: #define OP_BINARY(op, s, t) \
16: do { \
17: (s) = (s)op(t); \
18: } while (0) /* binary ops in the middle such as +, *, && etc. */
19: #define OP_FUNCTION(op, s, t) \
20: do { \
21: (s) = op((s), (t)); \
22: } while (0) /* ops like a function, such as PetscMax, PetscMin */
23: #define OP_LXOR(op, s, t) \
24: do { \
25: (s) = (!(s)) != (!(t)); \
26: } while (0) /* logical exclusive OR */
27: #define OP_ASSIGN(op, s, t) \
28: do { \
29: (s) = (t); \
30: } while (0)
31: /* Ref MPI MAXLOC */
32: #define OP_XLOC(op, s, t) \
33: do { \
34: if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
35: else if (!((s).u op(t).u)) s = t; \
36: } while (0)
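/* An illustrative, standalone sketch (not part of sfpack.c) of the MAXLOC semantics that
   OP_XLOC(>, s, t) encodes for the pair types defined later in this file: keep the larger .u,
   and on ties keep the smaller .i. Guarded by #if 0 since it is illustration only. */
#if 0
#include <stdio.h>
typedef struct {
  int u; /* value */
  int i; /* location */
} Pair;

static void MaxLoc(Pair *s, const Pair *t)
{
  if (s->u == t->u) s->i = (s->i < t->i) ? s->i : t->i; /* tie: the smaller location wins */
  else if (!(s->u > t->u)) *s = *t;                     /* t holds the larger value */
}

int main(void)
{
  Pair s = {5, 3}, t = {5, 1};
  MaxLoc(&s, &t);
  printf("u=%d i=%d\n", s.u, s.i); /* prints u=5 i=1 */
  return 0;
}
#endif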
38: /* DEF_PackFunc - macro defining a Pack routine
40: Arguments of the macro:
41: +Type Type of the basic data in an entry, e.g., int, PetscInt, PetscReal, etc. It is not the type of an entry.
42: .BS Block size for vectorization. It is a factor of bs.
43: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
45: Arguments of the Pack routine:
46: +count Number of indices in idx[].
47: .start When opt and idx are NULL, indices are contiguous and start is the first index; otherwise, it is not used.
48: .opt Per-pack optimization plan. NULL means no such plan.
49: .idx Indices of entries to be packed.
50: .link Provides a context for the current call, such as link->bs, the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
51: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
52: -packed Address of the packed data.
53: */
54: #define DEF_PackFunc(Type, BS, EQ) \
55: static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
56: { \
57: const Type *u = (const Type *)unpacked, *u2; \
58: Type *p = (Type *)packed, *p2; \
59: PetscInt i, j, k, X, Y, r, bs = link->bs; \
60: const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
61: const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
62: PetscFunctionBegin; \
63: if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
64: else if (opt) { /* has optimizations available */ p2 = p; \
65: for (r = 0; r < opt->n; r++) { \
66: u2 = u + opt->start[r] * MBS; \
67: X = opt->X[r]; \
68: Y = opt->Y[r]; \
69: for (k = 0; k < opt->dz[r]; k++) \
70: for (j = 0; j < opt->dy[r]; j++) { \
71: PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
72: p2 += opt->dx[r] * MBS; \
73: } \
74: } \
75: } else { \
76: for (i = 0; i < count; i++) \
77: for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
78: for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
79: p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
80: } \
81: PetscFunctionReturn(PETSC_SUCCESS); \
82: }
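/* An illustrative, standalone sketch (not part of sfpack.c; toy values) of what the generated Pack
   routines do in the indexed case: gather bs-sized entries u[idx[i]] into a contiguous buffer p.
   Guarded by #if 0 since it is illustration only. */
#if 0
#include <stdio.h>
int main(void)
{
  enum { bs = 2, count = 3 };
  const int idx[count] = {4, 1, 3};                                      /* entries to pack */
  const int u[6 * bs]  = {0, 0, 10, 11, 20, 21, 30, 31, 40, 41, 50, 51}; /* unpacked (sparse) data */
  int       p[count * bs];                                               /* packed buffer */

  for (int i = 0; i < count; i++)
    for (int k = 0; k < bs; k++) p[i * bs + k] = u[idx[i] * bs + k];

  for (int i = 0; i < count * bs; i++) printf("%d ", p[i]); /* prints 40 41 10 11 30 31 */
  printf("\n");
  return 0;
}
#endif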
84: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
85: and inserts into a sparse array.
87: Arguments:
88: .Type Type of the data
89: .BS Block size for vectorization
90: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
92: Notes:
93: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
94: */
95: #define DEF_UnpackFunc(Type, BS, EQ) \
96: static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
97: { \
98: Type *u = (Type *)unpacked, *u2; \
99: const Type *p = (const Type *)packed; \
100: PetscInt i, j, k, X, Y, r, bs = link->bs; \
101: const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
102: const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
103: PetscFunctionBegin; \
104: if (!idx) { \
105: u += start * MBS; \
106: if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
107: } else if (opt) { /* has optimizations available */ \
108: for (r = 0; r < opt->n; r++) { \
109: u2 = u + opt->start[r] * MBS; \
110: X = opt->X[r]; \
111: Y = opt->Y[r]; \
112: for (k = 0; k < opt->dz[r]; k++) \
113: for (j = 0; j < opt->dy[r]; j++) { \
114: PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
115: p += opt->dx[r] * MBS; \
116: } \
117: } \
118: } else { \
119: for (i = 0; i < count; i++) \
120: for (j = 0; j < M; j++) \
121: for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
122: } \
123: PetscFunctionReturn(PETSC_SUCCESS); \
124: }
126: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
128: Arguments:
129: +Opname Name of the Op, such as Add, Mult, LAND, etc.
130: .Type Type of the data
131: .BS Block size for vectorization
132: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
133: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
134: .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
135: */
136: #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
137: static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
138: { \
139: Type *u = (Type *)unpacked, *u2; \
140: const Type *p = (const Type *)packed; \
141: PetscInt i, j, k, X, Y, r, bs = link->bs; \
142: const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
143: const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
144: PetscFunctionBegin; \
145: if (!idx) { \
146: u += start * MBS; \
147: for (i = 0; i < count; i++) \
148: for (j = 0; j < M; j++) \
149: for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
150: } else if (opt) { /* idx[] has patterns */ \
151: for (r = 0; r < opt->n; r++) { \
152: u2 = u + opt->start[r] * MBS; \
153: X = opt->X[r]; \
154: Y = opt->Y[r]; \
155: for (k = 0; k < opt->dz[r]; k++) \
156: for (j = 0; j < opt->dy[r]; j++) { \
157: for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
158: p += opt->dx[r] * MBS; \
159: } \
160: } \
161: } else { \
162: for (i = 0; i < count; i++) \
163: for (j = 0; j < M; j++) \
164: for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
165: } \
166: PetscFunctionReturn(PETSC_SUCCESS); \
167: }
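/* An illustrative, standalone sketch (not part of sfpack.c; toy values) of the indexed
   UnpackAndAdd case generated above: scatter a contiguous buffer p back into the sparse array u,
   combining entries with +=. Duplicate indices accumulate. Guarded by #if 0, illustration only. */
#if 0
#include <stdio.h>
int main(void)
{
  enum { count = 3 };
  const int idx[count] = {4, 1, 1}; /* note the duplicate index: both contributions are summed */
  const int p[count]   = {7, 5, 6}; /* packed buffer */
  int       u[6]       = {0, 0, 0, 0, 0, 0};

  for (int i = 0; i < count; i++) u[idx[i]] += p[i];

  for (int i = 0; i < 6; i++) printf("%d ", u[i]); /* prints 0 11 0 0 7 0 */
  printf("\n");
  return 0;
}
#endif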
169: #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
170: static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
171: { \
172: Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
173: PetscInt i, j, k, r, l, bs = link->bs; \
174: const PetscInt M = (EQ) ? 1 : bs / BS; \
175: const PetscInt MBS = M * BS; \
176: PetscFunctionBegin; \
177: for (i = 0; i < count; i++) { \
178: r = (!idx ? start + i : idx[i]) * MBS; \
179: l = i * MBS; \
180: for (j = 0; j < M; j++) \
181: for (k = 0; k < BS; k++) { \
182: tmp = u[r + j * BS + k]; \
183: OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
184: p[l + j * BS + k] = tmp; \
185: } \
186: } \
187: PetscFunctionReturn(PETSC_SUCCESS); \
188: }
190: #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
191: static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
192: { \
193: const Type *u = (const Type *)src; \
194: Type *v = (Type *)dst; \
195: PetscInt i, j, k, s, t, X, Y, bs = link->bs; \
196: const PetscInt M = (EQ) ? 1 : bs / BS; \
197: const PetscInt MBS = M * BS; \
198: PetscFunctionBegin; \
199: if (!srcIdx) { /* src is contiguous */ \
200: u += srcStart * MBS; \
201: PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
202: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
203: u += srcOpt->start[0] * MBS; \
204: v += dstStart * MBS; \
205: X = srcOpt->X[0]; \
206: Y = srcOpt->Y[0]; \
207: for (k = 0; k < srcOpt->dz[0]; k++) \
208: for (j = 0; j < srcOpt->dy[0]; j++) { \
209: for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
210: v += srcOpt->dx[0] * MBS; \
211: } \
212: } else { /* all other cases */ \
213: for (i = 0; i < count; i++) { \
214: s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
215: t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
216: for (j = 0; j < M; j++) \
217: for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
218: } \
219: } \
220: PetscFunctionReturn(PETSC_SUCCESS); \
221: }
223: #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
224: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
225: { \
226: Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
227: const Type *ldata = (const Type *)leafdata; \
228: PetscInt i, j, k, r, l, bs = link->bs; \
229: const PetscInt M = (EQ) ? 1 : bs / BS; \
230: const PetscInt MBS = M * BS; \
231: PetscFunctionBegin; \
232: for (i = 0; i < count; i++) { \
233: r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
234: l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
235: for (j = 0; j < M; j++) \
236: for (k = 0; k < BS; k++) { \
237: lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
238: OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
239: } \
240: } \
241: PetscFunctionReturn(PETSC_SUCCESS); \
242: }
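/* An illustrative, standalone sketch (not part of sfpack.c; toy values) of the FetchAndAdd
   semantics defined above: each addressed entry of u is updated with the buffer value, while the
   buffer gets back the entry's old value (the "fetch"). Guarded by #if 0, illustration only. */
#if 0
#include <stdio.h>
int main(void)
{
  int       u[3]   = {100, 200, 300}; /* root data */
  int       p[2]   = {5, 7};          /* buffer */
  const int idx[2] = {2, 0};

  for (int i = 0; i < 2; i++) {
    int tmp    = u[idx[i]];
    u[idx[i]] += p[i]; /* apply the op to the root entry */
    p[i]       = tmp;  /* return the old root value */
  }
  printf("u: %d %d %d  p: %d %d\n", u[0], u[1], u[2], p[0], p[1]); /* u: 107 200 305  p: 300 100 */
  return 0;
}
#endif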
244: /* Pack, Unpack/Fetch ops */
245: #define DEF_Pack(Type, BS, EQ) \
246: DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
247: { \
248: link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \
249: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
250: link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
251: }
253: /* Add, Mult ops */
254: #define DEF_Add(Type, BS, EQ) \
255: DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
256: { \
257: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
258: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
259: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
260: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
261: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
262: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
263: }
265: /* Max, Min ops */
266: #define DEF_Cmp(Type, BS, EQ) \
267: DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
268: { \
269: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
270: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
271: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
272: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
273: }
275: /* Logical ops.
276: The operator passed to OP_LXOR is unused, so it could be empty; we pass || anyway to avoid
277: the compilation warning "empty macro arguments are undefined in ISO C90".
278: */
279: #define DEF_Log(Type, BS, EQ) \
280: DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
281: { \
282: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
283: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
284: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
285: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
286: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
287: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
288: }
290: /* Bitwise ops */
291: #define DEF_Bit(Type, BS, EQ) \
292: DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
293: { \
294: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
295: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
296: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
297: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
298: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
299: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
300: }
302: /* Maxloc, Minloc ops */
303: #define DEF_Xloc(Type, BS, EQ) \
304: DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
305: { \
306: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
307: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
308: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
309: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
310: }
312: #define DEF_IntegerType(Type, BS, EQ) \
313: DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
314: { \
315: CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
316: CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
317: CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
318: CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
319: CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
320: }
322: #define DEF_RealType(Type, BS, EQ) \
323: DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
324: { \
325: CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
326: CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
327: CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
328: }
330: #if defined(PETSC_HAVE_COMPLEX)
331: #define DEF_ComplexType(Type, BS, EQ) \
332: DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
333: { \
334: CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
335: CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
336: }
337: #endif
339: #define DEF_DumbType(Type, BS, EQ) \
340: DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
341: { \
342: CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
343: }
345: /* Maxloc, Minloc */
346: #define DEF_PairType(Type, BS, EQ) \
347: DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
348: { \
349: CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
350: CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
351: }
353: DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */
354: DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
355: DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
356: DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
357: DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
358: DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
359: DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
360: DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
362: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
363: DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
364: #endif
366: /* The typedefs are used to get one-word type names (no spaces) that CPPJoin4 can handle */
367: typedef signed char SignedChar;
368: DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)
370: typedef unsigned char UnsignedChar;
371: DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)
373: DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
374: #if defined(PETSC_HAVE_COMPLEX)
375: DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
376: #endif
378: #define PairType(Type1, Type2) Type1##_##Type2
379: typedef struct {
380: int u;
381: int i;
382: } PairType(int, int);
383: typedef struct {
384: PetscInt u;
385: PetscInt i;
386: } PairType(PetscInt, PetscInt);
387: DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
389: /* If we don't know the basic type, we treat it as a stream of chars or ints */
390: DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)
392: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
393: DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
395: PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
396: {
397: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
398: PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
400: PetscFunctionBegin;
401: /* Destroy device-specific fields */
402: if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
404: /* Destroy host related fields */
405: if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
406: if (!link->use_nvshmem) {
407: for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
408: if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
409: }
410: PetscCall(PetscFree(link->reqs));
411: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
412: PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
413: PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
414: }
415: }
416: PetscCall(PetscFree(link));
417: PetscFunctionReturn(PETSC_SUCCESS);
418: }
420: PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
421: {
422: PetscFunctionBegin;
423: PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
424: #if defined(PETSC_HAVE_NVSHMEM)
425: {
426: PetscBool use_nvshmem;
427: PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
428: if (use_nvshmem) {
429: PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
430: PetscFunctionReturn(PETSC_SUCCESS);
431: }
432: }
433: #endif
434: PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
435: PetscFunctionReturn(PETSC_SUCCESS);
436: }
438: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
439: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
440: */
441: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
442: {
443: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
444: PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
445: const PetscInt *rootoffset, *leafoffset;
446: MPI_Aint disp;
447: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
448: MPI_Datatype unit = link->unit;
449: const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
450: const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
452: PetscFunctionBegin;
453: /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
454: if (sf->persistent) {
455: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
456: PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
457: if (direction == PETSCSF_LEAF2ROOT) {
458: for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
459: disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
460: cnt = rootoffset[i + 1] - rootoffset[i];
461: PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
462: }
463: } else { /* PETSCSF_ROOT2LEAF */
464: for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
465: disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
466: cnt = rootoffset[i + 1] - rootoffset[i];
467: PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
468: }
469: }
470: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
471: }
473: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
474: PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
475: if (direction == PETSCSF_LEAF2ROOT) {
476: for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
477: disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
478: cnt = leafoffset[i + 1] - leafoffset[i];
479: PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
480: }
481: } else { /* PETSCSF_ROOT2LEAF */
482: for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
483: disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
484: cnt = leafoffset[i + 1] - leafoffset[i];
485: PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
486: }
487: }
488: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
489: }
490: }
491: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
492: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
493: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
494: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
495: PetscFunctionReturn(PETSC_SUCCESS);
496: }
498: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
499: {
500: PetscSFLink link, *p;
501: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
503: PetscFunctionBegin;
504: /* Look for types in cache */
505: for (p = &bas->inuse; (link = *p); p = &link->next) {
506: PetscBool match;
507: PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
508: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
509: switch (cmode) {
510: case PETSC_OWN_POINTER:
511: *p = link->next;
512: break; /* Remove from inuse list */
513: case PETSC_USE_POINTER:
514: break;
515: default:
516: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
517: }
518: *mylink = link;
519: PetscFunctionReturn(PETSC_SUCCESS);
520: }
521: }
522: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
523: }
525: PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
526: {
527: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
528: PetscSFLink link = *mylink;
530: PetscFunctionBegin;
531: link->rootdata = NULL;
532: link->leafdata = NULL;
533: link->next = bas->avail;
534: bas->avail = link;
535: *mylink = NULL;
536: PetscFunctionReturn(PETSC_SUCCESS);
537: }
539: /* Error out on unsupported overlapped communications */
540: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
541: {
542: PetscSFLink link, *p;
543: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
544: PetscBool match;
546: PetscFunctionBegin;
547: if (PetscDefined(USE_DEBUG)) {
548: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
549: the potential overlapping since this process does not participate in communication. Overlapping is harmless.
550: */
551: if (rootdata || leafdata) {
552: for (p = &bas->inuse; (link = *p); p = &link->next) {
553: PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
554: PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
555: }
556: }
557: }
558: PetscFunctionReturn(PETSC_SUCCESS);
559: }
561: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
562: {
563: PetscFunctionBegin;
564: if (n) PetscCall(PetscMemcpy(dst, src, n));
565: PetscFunctionReturn(PETSC_SUCCESS);
566: }
568: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
569: {
570: PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
571: PetscBool is2Int, is2PetscInt;
572: PetscMPIInt ni, na, nd, combiner;
573: #if defined(PETSC_HAVE_COMPLEX)
574: PetscInt nPetscComplex = 0;
575: #endif
577: PetscFunctionBegin;
578: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
579: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
580: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
581: PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
582: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
583: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
584: #if defined(PETSC_HAVE_COMPLEX)
585: PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
586: #endif
587: PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
588: PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
589: /* TODO: shall we also handle Fortran MPI_2REAL? */
590: PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
591: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
592: link->bs = 1; /* default */
594: if (is2Int) {
595: PackInit_PairType_int_int_1_1(link);
596: link->bs = 1;
597: link->unitbytes = 2 * sizeof(int);
598: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
599: link->basicunit = MPI_2INT;
600: link->unit = MPI_2INT;
601: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
602: PackInit_PairType_PetscInt_PetscInt_1_1(link);
603: link->bs = 1;
604: link->unitbytes = 2 * sizeof(PetscInt);
605: link->basicunit = MPIU_2INT;
606: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
607: link->unit = MPIU_2INT;
608: } else if (nPetscReal) {
609: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
610: else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
611: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
612: else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
613: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
614: else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
615: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
616: else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
617: link->bs = nPetscReal;
618: link->unitbytes = nPetscReal * sizeof(PetscReal);
619: link->basicunit = MPIU_REAL;
620: if (link->bs == 1) {
621: link->isbuiltin = PETSC_TRUE;
622: link->unit = MPIU_REAL;
623: }
624: } else if (nPetscInt) {
625: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
626: else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
627: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
628: else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
629: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
630: else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
631: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
632: else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
633: link->bs = nPetscInt;
634: link->unitbytes = nPetscInt * sizeof(PetscInt);
635: link->basicunit = MPIU_INT;
636: if (link->bs == 1) {
637: link->isbuiltin = PETSC_TRUE;
638: link->unit = MPIU_INT;
639: }
640: #if defined(PETSC_USE_64BIT_INDICES)
641: } else if (nInt) {
642: if (nInt == 8) PackInit_IntegerType_int_8_1(link);
643: else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
644: else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
645: else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
646: else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
647: else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
648: else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
649: else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
650: link->bs = nInt;
651: link->unitbytes = nInt * sizeof(int);
652: link->basicunit = MPI_INT;
653: if (link->bs == 1) {
654: link->isbuiltin = PETSC_TRUE;
655: link->unit = MPI_INT;
656: }
657: #endif
658: } else if (nSignedChar) {
659: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
660: else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
661: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
662: else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
663: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
664: else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
665: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
666: else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
667: link->bs = nSignedChar;
668: link->unitbytes = nSignedChar * sizeof(SignedChar);
669: link->basicunit = MPI_SIGNED_CHAR;
670: if (link->bs == 1) {
671: link->isbuiltin = PETSC_TRUE;
672: link->unit = MPI_SIGNED_CHAR;
673: }
674: } else if (nUnsignedChar) {
675: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
676: else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
677: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
678: else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
679: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
680: else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
681: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
682: else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
683: link->bs = nUnsignedChar;
684: link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
685: link->basicunit = MPI_UNSIGNED_CHAR;
686: if (link->bs == 1) {
687: link->isbuiltin = PETSC_TRUE;
688: link->unit = MPI_UNSIGNED_CHAR;
689: }
690: #if defined(PETSC_HAVE_COMPLEX)
691: } else if (nPetscComplex) {
692: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
693: else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
694: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
695: else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
696: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
697: else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
698: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
699: else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700: link->bs = nPetscComplex;
701: link->unitbytes = nPetscComplex * sizeof(PetscComplex);
702: link->basicunit = MPIU_COMPLEX;
703: if (link->bs == 1) {
704: link->isbuiltin = PETSC_TRUE;
705: link->unit = MPIU_COMPLEX;
706: }
707: #endif
708: } else {
709: MPI_Aint lb, nbyte;
710: PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
711: PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
712: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
713: if (nbyte == 4) PackInit_DumbType_char_4_1(link);
714: else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
715: else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
716: else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
717: else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
718: else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
719: link->bs = nbyte;
720: link->unitbytes = nbyte;
721: link->basicunit = MPI_BYTE;
722: } else {
723: nInt = nbyte / sizeof(int);
724: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
725: else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
726: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
727: else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
728: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
729: else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
730: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
731: else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
732: link->bs = nInt;
733: link->unitbytes = nbyte;
734: link->basicunit = MPI_INT;
735: }
736: if (link->isbuiltin) link->unit = unit;
737: }
739: if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
741: link->Memcpy = PetscSFLinkMemcpy_Host;
742: PetscFunctionReturn(PETSC_SUCCESS);
743: }
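/* A hedged usage sketch (not part of sfpack.c; assumes a valid, set-up PetscSF and root/leaf
   arrays holding 4 PetscReals per vertex). A contiguous unit of 4 MPIU_REALs makes the setup above
   select the PackInit_RealType_PetscReal_4_1 path, i.e. bs = 4, basicunit = MPIU_REAL and
   unitbytes = 4*sizeof(PetscReal). Guarded by #if 0, illustration only. */
#if 0
static PetscErrorCode BcastFourReals(PetscSF sf, const PetscReal *rootdata, PetscReal *leafdata)
{
  MPI_Datatype unit;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Type_contiguous(4, MPIU_REAL, &unit));
  PetscCallMPI(MPI_Type_commit(&unit));
  PetscCall(PetscSFBcastBegin(sf, unit, rootdata, leafdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, unit, rootdata, leafdata, MPI_REPLACE));
  PetscCallMPI(MPI_Type_free(&unit));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif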
745: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
746: {
747: PetscFunctionBegin;
748: *UnpackAndOp = NULL;
749: if (PetscMemTypeHost(mtype)) {
750: if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
751: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
752: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
753: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
754: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
755: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
756: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
757: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
758: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
759: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
760: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
761: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
762: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
763: }
764: #if defined(PETSC_HAVE_DEVICE)
765: else if (PetscMemTypeDevice(mtype) && !atomic) {
766: if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
767: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
768: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
769: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
770: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
771: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
772: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
773: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
774: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
775: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
776: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
777: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
778: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
779: } else if (PetscMemTypeDevice(mtype) && atomic) {
780: if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
781: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
782: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
783: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
784: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
785: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
786: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
787: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
788: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
789: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
790: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
791: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
792: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
793: }
794: #endif
795: PetscFunctionReturn(PETSC_SUCCESS);
796: }
798: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
799: {
800: PetscFunctionBegin;
801: *ScatterAndOp = NULL;
802: if (PetscMemTypeHost(mtype)) {
803: if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
804: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
805: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
806: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
807: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
808: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
809: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
810: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
811: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
812: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
813: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
814: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
815: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
816: }
817: #if defined(PETSC_HAVE_DEVICE)
818: else if (PetscMemTypeDevice(mtype) && !atomic) {
819: if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
820: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
821: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
822: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
823: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
824: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
825: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
826: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
827: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
828: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
829: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
830: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
831: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
832: } else if (PetscMemTypeDevice(mtype) && atomic) {
833: if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
834: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
835: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
836: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
837: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
838: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
839: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
840: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
841: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
842: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
843: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
844: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
845: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
846: }
847: #endif
848: PetscFunctionReturn(PETSC_SUCCESS);
849: }
851: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
852: {
853: PetscFunctionBegin;
854: *FetchAndOp = NULL;
855: PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
856: if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
857: #if defined(PETSC_HAVE_DEVICE)
858: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
859: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
860: #endif
861: PetscFunctionReturn(PETSC_SUCCESS);
862: }
864: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
865: {
866: PetscFunctionBegin;
867: *FetchAndOpLocal = NULL;
868: PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
869: if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
870: #if defined(PETSC_HAVE_DEVICE)
871: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
872: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
873: #endif
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
877: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
878: {
879: PetscLogDouble flops;
880: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
882: PetscFunctionBegin;
883: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
884: flops = bas->rootbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */
885: #if defined(PETSC_HAVE_DEVICE)
886: if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
887: else
888: #endif
889: PetscCall(PetscLogFlops(flops));
890: }
891: PetscFunctionReturn(PETSC_SUCCESS);
892: }
894: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
895: {
896: PetscLogDouble flops;
898: PetscFunctionBegin;
899: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
900: flops = sf->leafbuflen[scope] * link->bs; /* # of leaves in buffer x # of scalars in unit */
901: #if defined(PETSC_HAVE_DEVICE)
902: if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
903: else
904: #endif
905: PetscCall(PetscLogFlops(flops));
906: }
907: PetscFunctionReturn(PETSC_SUCCESS);
908: }
910: /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
911: Input Parameters:
912: +sf - The StarForest
913: .link - The link
914: .count - Number of entries to unpack
915: .start - The first index, significant when indices=NULL
916: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
917: .buf - A contiguous buffer to unpack from
918: -op - Operation after unpack
920: Output Parameters:
921: .data - The data to unpack to
922: */
923: static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
924: {
925: PetscFunctionBegin;
926: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
927: {
928: PetscInt i;
929: if (indices) {
930: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
931: element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
932: */
933: for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
934: } else {
935: PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
936: }
937: }
938: PetscFunctionReturn(PETSC_SUCCESS);
939: #else
940: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
941: #endif
942: }
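/* An illustrative, standalone sketch (not part of sfpack.c) of the MPI_Reduce_local fallback used
   above: it combines inbuf into inoutbuf with the given op, here summing 3 ints element-wise.
   Guarded by #if 0, illustration only. */
#if 0
#include <mpi.h>
#include <stdio.h>
int main(int argc, char **argv)
{
  int in[3] = {1, 2, 3}, inout[3] = {10, 20, 30};

  MPI_Init(&argc, &argv);
  MPI_Reduce_local(in, inout, 3, MPI_INT, MPI_SUM);
  printf("%d %d %d\n", inout[0], inout[1], inout[2]); /* prints 11 22 33 */
  MPI_Finalize();
  return 0;
}
#endif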
944: static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
945: {
946: PetscFunctionBegin;
947: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
948: {
949: PetscInt i, disp;
950: if (!srcIdx) {
951: PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
952: } else {
953: for (i = 0; i < count; i++) {
954: disp = dstIdx ? dstIdx[i] : dstStart + i;
955: PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
956: }
957: }
958: }
959: PetscFunctionReturn(PETSC_SUCCESS);
960: #else
961: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
962: #endif
963: }
965: /*=============================================================================
966: Pack/Unpack/Fetch/Scatter routines
967: ============================================================================*/
969: /* Pack rootdata to rootbuf
970: Input Parameters:
971: + sf - The SF this packing works on.
972: . link - It gives the memtype of the roots and also provides root buffer.
973: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
974: - rootdata - Where to read the roots.
976: Notes:
977: When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
978: in a place where the underlying MPI is ready to access it (with or without use_gpu_aware_mpi).
979: */
980: static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
981: {
982: const PetscInt *rootindices = NULL;
983: PetscInt count, start;
984: PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
985: PetscMemType rootmtype = link->rootmtype;
986: PetscSFPackOpt opt = NULL;
988: PetscFunctionBegin;
989: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
990: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
991: PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
992: PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
993: }
994: PetscFunctionReturn(PETSC_SUCCESS);
995: }
997: /* Pack leafdata to leafbuf */
998: static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
999: {
1000: const PetscInt *leafindices = NULL;
1001: PetscInt count, start;
1002: PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1003: PetscMemType leafmtype = link->leafmtype;
1004: PetscSFPackOpt opt = NULL;
1006: PetscFunctionBegin;
1007: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1008: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
1009: PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
1010: PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1011: }
1012: PetscFunctionReturn(PETSC_SUCCESS);
1013: }
1015: /* Pack rootdata to rootbuf, which are in the same memory space */
1016: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
1017: {
1018: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1020: PetscFunctionBegin;
1021: if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1022: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1023: if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1024: }
1025: PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1026: if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
1027: PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1028: PetscFunctionReturn(PETSC_SUCCESS);
1029: }
1030: /* Pack leafdata to leafbuf, which are in the same memory space */
1031: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1032: {
1033: PetscFunctionBegin;
1034: if (scope == PETSCSF_REMOTE) {
1035: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1036: if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
1037: }
1038: PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
1039: if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
1040: PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
1041: PetscFunctionReturn(PETSC_SUCCESS);
1042: }
1044: static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1045: {
1046: const PetscInt *rootindices = NULL;
1047: PetscInt count, start;
1048: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1049: PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1050: PetscMemType rootmtype = link->rootmtype;
1051: PetscSFPackOpt opt = NULL;
1053: PetscFunctionBegin;
1054: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1055: PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
1056: if (UnpackAndOp) {
1057: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
1058: PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
1059: } else {
1060: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
1061: PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1062: }
1063: }
1064: PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
1065: PetscFunctionReturn(PETSC_SUCCESS);
1066: }
1068: static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1069: {
1070: const PetscInt *leafindices = NULL;
1071: PetscInt count, start;
1072: PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1073: PetscMemType leafmtype = link->leafmtype;
1074: PetscSFPackOpt opt = NULL;
1076: PetscFunctionBegin;
1077: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1078: PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1079: if (UnpackAndOp) {
1080: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
1081: PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1082: } else {
1083: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
1084: PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1085: }
1086: }
1087: PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
1088: PetscFunctionReturn(PETSC_SUCCESS);
1089: }
1090: /* Unpack rootbuf to rootdata, which are in the same memory space */
1091: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1092: {
1093: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1095: PetscFunctionBegin;
1096: PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1097: if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
1098: PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1099: if (scope == PETSCSF_REMOTE) {
1100: if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
1101: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1102: }
1103: PetscFunctionReturn(PETSC_SUCCESS);
1104: }
1106: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1107: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1108: {
1109: PetscFunctionBegin;
1110: PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1111: if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
1112: PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1113: if (scope == PETSCSF_REMOTE) {
1114: if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
1115: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1116: }
1117: PetscFunctionReturn(PETSC_SUCCESS);
1118: }
1120: /* FetchAndOp rootdata with rootbuf; it is a kind of Unpack on rootdata, except that it also updates rootbuf */
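/* Roughly, for each k-th remote root entry with root index r = rootindices[k] (illustrative pseudocode,
   not actual code):
     old         = rootdata[r];
     rootdata[r] = op(rootdata[r], rootbuf[k]);  // rootbuf holds the incoming leaf values
     rootbuf[k]  = old;                          // old root values are later sent back to become leafupdate
*/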
1121: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1122: {
1123: const PetscInt *rootindices = NULL;
1124: PetscInt count, start;
1125: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1126: PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1127: PetscMemType rootmtype = link->rootmtype;
1128: PetscSFPackOpt opt = NULL;
1130: PetscFunctionBegin;
1131: PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
1132: if (bas->rootbuflen[PETSCSF_REMOTE]) {
1133: /* Do FetchAndOp on rootdata with rootbuf */
1134: PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
1135: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
1136: PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1137: }
1138: PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
1139: PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1140: PetscFunctionReturn(PETSC_SUCCESS);
1141: }
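/* Scatter between rootdata and leafdata for the local (on-process) part of the graph, i.e., roughly
   dst[dstindex] = op(dst[dstindex], src[srcindex]) for each local edge, where src/dst are chosen by
   the given direction. When root and leaf data live in different memory spaces, the scatter is done
   by pack, copy across memory spaces with link->Memcpy, and unpack; otherwise a fused ScatterAndOp
   kernel is used when one is available. */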
1143: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1144: {
1145: const PetscInt *rootindices = NULL, *leafindices = NULL;
1146: PetscInt count, rootstart, leafstart;
1147: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1148: PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
1149: PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1150: PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1151: PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
1152: char *srcbuf = NULL, *dstbuf = NULL;
1153: PetscBool dstdups;
1155: PetscFunctionBegin;
1156: if (!buflen) PetscFunctionReturn(PETSC_SUCCESS);
1157: if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1158: if (direction == PETSCSF_ROOT2LEAF) {
1159: PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
1160: srcmtype = rootmtype;
1161: srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1162: dstmtype = leafmtype;
1163: dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1164: } else {
1165: PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
1166: srcmtype = leafmtype;
1167: srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1168: dstmtype = rootmtype;
1169: dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1170: }
1171: PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
1172: /* If the above is a device-to-host copy, we have to sync the stream before accessing the buffer on the host */
1173: if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
1174: if (direction == PETSCSF_ROOT2LEAF) {
1175: PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1176: } else {
1177: PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
1178: }
1179: } else {
1180: dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1181: dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1182: PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1183: if (ScatterAndOp) {
1184: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1185: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1186: if (direction == PETSCSF_ROOT2LEAF) {
1187: PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1188: } else {
1189: PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
1190: }
1191: } else {
1192: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1193: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1194: if (direction == PETSCSF_ROOT2LEAF) {
1195: PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
1196: } else {
1197: PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1198: }
1199: }
1200: }
1201: PetscFunctionReturn(PETSC_SUCCESS);
1202: }
1204: /* Fetch the old rootdata into leafupdate and reduce leafdata into rootdata, for the local part of the graph */
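/* Roughly, for each local edge (root index r, leaf index l) (illustrative pseudocode, not actual code):
     leafupdate[l] = rootdata[r];                   // fetch the old root value
     rootdata[r]   = op(rootdata[r], leafdata[l]);  // then reduce the leaf value into the root
*/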
1205: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1206: {
1207: const PetscInt *rootindices = NULL, *leafindices = NULL;
1208: PetscInt count, rootstart, leafstart;
1209: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1210: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1211: const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1212: PetscSFPackOpt leafopt = NULL, rootopt = NULL;
1214: PetscFunctionBegin;
1215: if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1216: if (rootmtype != leafmtype) {
1217: /* A cross memory-space local FetchAndOp would have to go through pack, copy and unpack, which is not supported */
1218: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1219: } else {
1220: PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
1221: PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
1222: PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
1223: PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1224: }
1225: PetscFunctionReturn(PETSC_SUCCESS);
1226: }
1228: /*
1229: Create per-rank pack/unpack optimizations based on index patterns
1231: Input Parameters:
1232: + n - Number of destination ranks
1233: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1234: - idx - [*] Array storing indices
1236: Output Parameter:
1237: . out - Pack optimizations. NULL if no optimizations were found.
1238: */
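/* A worked example (illustrative): if a rank's indices are {10,11,12, 20,21,22, 30,31,32}, the search
   below recovers start=10, dx=3, dy=3, dz=1 with row stride X=10 (and Y=3), i.e., a 3x3 sub-block of a
   logically 2D/3D array, so packing can be done with strided copies instead of per-index gathers. */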
1239: static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1240: {
1241: PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1242: PetscBool optimizable = PETSC_TRUE;
1243: PetscSFPackOpt opt;
1245: PetscFunctionBegin;
1246: PetscCall(PetscMalloc1(1, &opt));
1247: PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1248: opt->n = opt->array[0] = n;
1249: opt->offset = opt->array + 1;
1250: opt->start = opt->array + n + 2;
1251: opt->dx = opt->array + 2 * n + 2;
1252: opt->dy = opt->array + 3 * n + 2;
1253: opt->dz = opt->array + 4 * n + 2;
1254: opt->X = opt->array + 5 * n + 2;
1255: opt->Y = opt->array + 6 * n + 2;
1257: for (r = 0; r < n; r++) { /* For each destination rank */
1258: m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1259: p = offset[r];
1260: start = idx[p]; /* First index for this rank */
1261: p++;
1263: /* Search in X dimension */
1264: for (dx = 1; dx < m; dx++, p++) {
1265: if (start + dx != idx[p]) break;
1266: }
1268: dydz = m / dx;
1269: X = dydz > 1 ? (idx[p] - start) : dx;
1270: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1271: if (m % dx || X <= 0) {
1272: optimizable = PETSC_FALSE;
1273: goto finish;
1274: }
1275: for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1276: for (i = 0; i < dx; i++, p++) {
1277: if (start + X * dy + i != idx[p]) {
1278: if (i) {
1279: optimizable = PETSC_FALSE;
1280: goto finish;
1281: } /* The pattern is violated in the middle of an x-walk */
1282: else
1283: goto Z_dimension;
1284: }
1285: }
1286: }
1288: Z_dimension:
1289: dz = m / (dx * dy);
1290: Y = dz > 1 ? (idx[p] - start) / X : dy;
1291: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1292: if (m % (dx * dy) || Y <= 0) {
1293: optimizable = PETSC_FALSE;
1294: goto finish;
1295: }
1296: for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1297: for (j = 0; j < dy; j++) {
1298: for (i = 0; i < dx; i++, p++) {
1299: if (start + X * Y * k + X * j + i != idx[p]) {
1300: optimizable = PETSC_FALSE;
1301: goto finish;
1302: }
1303: }
1304: }
1305: }
1306: opt->start[r] = start;
1307: opt->dx[r] = dx;
1308: opt->dy[r] = dy;
1309: opt->dz[r] = dz;
1310: opt->X[r] = X;
1311: opt->Y[r] = Y;
1312: }
1314: finish:
1315: /* If not optimizable, free arrays to save memory */
1316: if (!n || !optimizable) {
1317: PetscCall(PetscFree(opt->array));
1318: PetscCall(PetscFree(opt));
1319: *out = NULL;
1320: } else {
1321: opt->offset[0] = 0;
1322: for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1323: *out = opt;
1324: }
1325: PetscFunctionReturn(PETSC_SUCCESS);
1326: }
1328: static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1329: {
1330: PetscSFPackOpt opt = *out;
1332: PetscFunctionBegin;
1333: if (opt) {
1334: PetscCall(PetscSFFree(sf, mtype, opt->array));
1335: PetscCall(PetscFree(opt));
1336: *out = NULL;
1337: }
1338: PetscFunctionReturn(PETSC_SUCCESS);
1339: }
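/* Analyze the leaf and root index arrays once at setup time: record the contiguous-start defaults,
   detect whether the self/remote indices are contiguous, build per-rank pack optimizations when they
   are not, and (when devices are enabled) check for duplicate indices so unpack kernels can avoid atomics. */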
1341: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1342: {
1343: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1344: PetscInt i, j;
1346: PetscFunctionBegin;
1347: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1348: for (i = 0; i < 2; i++) { /* Set defaults */
1349: sf->leafstart[i] = 0;
1350: sf->leafcontig[i] = PETSC_TRUE;
1351: sf->leafdups[i] = PETSC_FALSE;
1352: bas->rootstart[i] = 0;
1353: bas->rootcontig[i] = PETSC_TRUE;
1354: bas->rootdups[i] = PETSC_FALSE;
1355: }
1357: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1358: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1360: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1361: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1363: /* Are the leaf indices for self and remote contiguous? If so, pack/unpack reduces to a single contiguous copy */
1364: for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1365: if (sf->rmine[i] != sf->leafstart[0] + i) {
1366: sf->leafcontig[0] = PETSC_FALSE;
1367: break;
1368: }
1369: }
1370: for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1371: if (sf->rmine[i] != sf->leafstart[1] + j) {
1372: sf->leafcontig[1] = PETSC_FALSE;
1373: break;
1374: }
1375: }
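/* For example, if the remote leaf indices in rmine[] are {100,101,102,...}, leafcontig[1] stays
   PETSC_TRUE and packing that part reduces to a single contiguous copy starting at leafstart[1]=100. */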
1377: /* If not, see if we can have per-rank optimizations by doing index analysis */
1378: if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
1379: if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1381: /* Are root indices for self and remote contiguous? */
1382: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1383: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1385: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1386: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1388: for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1389: if (bas->irootloc[i] != bas->rootstart[0] + i) {
1390: bas->rootcontig[0] = PETSC_FALSE;
1391: break;
1392: }
1393: }
1394: for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1395: if (bas->irootloc[i] != bas->rootstart[1] + j) {
1396: bas->rootcontig[1] = PETSC_FALSE;
1397: break;
1398: }
1399: }
1401: if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
1402: if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1404: /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1405: if (PetscDefined(HAVE_DEVICE)) {
1406: PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1407: if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
1408: if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
1409: if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
1410: if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1411: }
1412: PetscFunctionReturn(PETSC_SUCCESS);
1413: }
1415: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1416: {
1417: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1418: PetscInt i;
1420: PetscFunctionBegin;
1421: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
1422: PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
1423: PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
1424: #if defined(PETSC_HAVE_DEVICE)
1425: PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
1426: PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
1427: #endif
1428: }
1429: PetscFunctionReturn(PETSC_SUCCESS);
1430: }