Actual source code: sfmpi.c

  1: /* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have persistent versions,
  2:    we can also add abstractions like Prepare/StartCommunication.
  3: */

  5: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  7: /* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer */
  8: static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
  9: {
 10:   PetscMPIInt    nreqs;
 11:   MPI_Request   *reqs = NULL;
 12:   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
 13:   PetscInt       buflen;

 15:   PetscFunctionBegin;
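  /* Post the receives first: for ROOT2LEAF the receiving side is the leaves (hence leafbuflen below), for LEAF2ROOT it is the roots.
     The second block then stages device data on the host if MPI is not GPU-aware, syncs the stream, and starts the sends. */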
 16:   buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
 17:   if (buflen) {
 18:     if (direction == PETSCSF_ROOT2LEAF) {
 19:       nreqs = sf->nleafreqs;
 20:       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
 21:     } else { /* leaf to root */
 22:       nreqs = bas->nrootreqs;
 23:       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
 24:     }
 25:     PetscCallMPI(MPI_Startall_irecv(buflen, link->unit, nreqs, reqs));
 26:   }

 28:   buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
 29:   if (buflen) {
 30:     if (direction == PETSCSF_ROOT2LEAF) {
 31:       nreqs = bas->nrootreqs;
 32:       PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
 33:       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &reqs, NULL));
 34:     } else { /* leaf to root */
 35:       nreqs = sf->nleafreqs;
 36:       PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE));
 37:       PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, NULL, &reqs));
 38:     }
 39:     PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
 40:     PetscCallMPI(MPI_Startall_isend(buflen, link->unit, nreqs, reqs));
 41:   }
 42:   PetscFunctionReturn(PETSC_SUCCESS);
 43: }

 45: static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
 46: {
 47:   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
 48:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
 49:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

 51:   PetscFunctionBegin;
 52:   PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
 53:   PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
 54:   if (direction == PETSCSF_ROOT2LEAF) {
 55:     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
 56:   } else {
 57:     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
 58:   }
 59:   PetscFunctionReturn(PETSC_SUCCESS);
 60: }
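The MPI_Startall_irecv()/MPI_Startall_isend() calls in PetscSFLinkStartRequests_MPI() appear to be PETSc-internal helpers rather than standard MPI: they start the link's pre-initialized persistent requests, which PetscSFLinkWaitRequests_MPI() then completes with MPI_Waitall(). For readers unfamiliar with that pattern, below is a minimal, self-contained sketch of plain persistent MPI requests, independent of PETSc; all names and sizes in it are made up for illustration.

#include <mpi.h>

/* Minimal illustration of persistent MPI requests: init once, start/wait repeatedly, free at the end */
int main(int argc, char **argv)
{
  int         rank, size, sendbuf[4] = {0, 1, 2, 3}, recvbuf[4];
  MPI_Request reqs[2];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (size >= 2 && rank < 2) {
    int peer = 1 - rank;
    MPI_Recv_init(recvbuf, 4, MPI_INT, peer, 0, MPI_COMM_WORLD, &reqs[0]); /* requests are created once ... */
    MPI_Send_init(sendbuf, 4, MPI_INT, peer, 0, MPI_COMM_WORLD, &reqs[1]);
    for (int iter = 0; iter < 3; iter++) {          /* ... and restarted every iteration, like the requests of a cached SF link */
      MPI_Startall(2, reqs);                        /* counterpart of MPI_Startall_irecv/_isend above */
      MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);    /* counterpart of PetscSFLinkWaitRequests_MPI */
    }
    MPI_Request_free(&reqs[0]); /* persistent requests must be freed explicitly, cf. MPI_Request_free in PetscSFLinkCreate_MPI */
    MPI_Request_free(&reqs[1]);
  }
  MPI_Finalize();
  return 0;
}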

 62: #if defined(PETSC_HAVE_MPIX_STREAM)
 63: // issue MPIX_Isend/Irecv_enqueue()
 64: static PetscErrorCode PetscSFLinkStartEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
 65: {
 66:   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
 67:   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
 68:   const PetscInt    *rootoffset, *leafoffset;
 69:   MPI_Aint           disp;
 70:   MPI_Comm           stream_comm   = sf->stream_comm;
 71:   MPI_Datatype       unit          = link->unit;
 72:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
 73:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

 75:   PetscFunctionBegin;
 76:   if (bas->rootbuflen[PETSCSF_REMOTE]) {
 77:     PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
 78:     if (direction == PETSCSF_LEAF2ROOT) {
 79:       for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
 80:         disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
 81:         cnt  = rootoffset[i + 1] - rootoffset[i];
 82:         PetscCallMPI(MPIX_Irecv_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
 83:       }
 84:     } else { // PETSCSF_ROOT2LEAF
 85:       for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
 86:         disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
 87:         cnt  = rootoffset[i + 1] - rootoffset[i];
 88:         // no need to sync the gpu stream!
 89:         PetscCallMPI(MPIX_Isend_enqueue(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, stream_comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
 90:       }
 91:     }
 92:   }

 94:   if (sf->leafbuflen[PETSCSF_REMOTE]) {
 95:     PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
 96:     if (direction == PETSCSF_LEAF2ROOT) {
 97:       for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
 98:         disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
 99:         cnt  = leafoffset[i + 1] - leafoffset[i];
100:         // no need to sync the gpu stream!
101:         PetscCallMPI(MPIX_Isend_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
102:       }
103:     } else { // PETSCSF_ROOT2LEAF
104:       for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
105:         disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
106:         cnt  = leafoffset[i + 1] - leafoffset[i];
107:         PetscCallMPI(MPIX_Irecv_enqueue(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, stream_comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
108:       }
109:     }
110:   }
111:   PetscFunctionReturn(PETSC_SUCCESS);
112: }
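To make the displacement/count arithmetic in the loops of PetscSFLinkStartEnqueue_MPIX_Stream() concrete, here is a small worked example. The offsets, the single distinguished on-process rank, and the helper name SketchOffsetArithmetic are made up for illustration; the formulas are exactly those used above.

static void SketchOffsetArithmetic(void)
{
  /* Hypothetical layout: 1 distinguished (on-process) root rank followed by 2 remote root ranks.
     rootoffset[] is cumulative over ranks; entries past ndrootranks index into the remote root buffer. */
  const PetscInt ndrootranks  = 1, nrootranks = 3;
  const PetscInt rootoffset[] = {0, 3, 7, 12}; /* self rank owns slots [0,3); remote ranks own [3,7) and [7,12) */
  const size_t   unitbytes    = 8;             /* e.g. one 8-byte unit per entry */

  for (PetscInt i = ndrootranks; i < nrootranks; i++) {
    MPI_Aint disp = (rootoffset[i] - rootoffset[ndrootranks]) * unitbytes; /* byte offset into rootbuf[PETSCSF_REMOTE] */
    PetscInt cnt  = rootoffset[i + 1] - rootoffset[i];                     /* #units exchanged with remote rank i */
    (void)disp; (void)cnt;
    /* i = 1: disp = 0,  cnt = 4;   i = 2: disp = 32, cnt = 5 */
  }
}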

114: static PetscErrorCode PetscSFLinkWaitEnqueue_MPIX_Stream(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
115: {
116:   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
117:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
118:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

120:   PetscFunctionBegin;
121:   PetscCallMPI(MPIX_Waitall_enqueue(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
122:   PetscCallMPI(MPIX_Waitall_enqueue(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
123:   PetscFunctionReturn(PETSC_SUCCESS);
124: }
125: #endif

127: /*
128:    This routine creates a communication link for the given operation. It first looks up the link cache. If
129:    there is a free & suitable link, it reuses that one. Otherwise it creates a new one.

131:    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
132:    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find that root/leafdata
133:    can be passed to MPI directly, we do not allocate them. Even when we do allocate buffers, we only allocate
134:    those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

136:    The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

138:    In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

140:    The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
141:    need to pack/unpack data.
142: */
143: PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
144: {
145:   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
146:   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
147:   PetscSFLink     *p, link;
148:   PetscSFDirection direction;
149:   MPI_Request     *reqs = NULL;
150:   PetscBool        match, rootdirect[2], leafdirect[2];
151:   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
152:   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
153:   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
 154:   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

156:   PetscFunctionBegin;

 158:   /* Can root/leafdata be used directly (i.e., with no separate pack/unpack buffer) for the given sf, sfop and op? */
159:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
160:     if (sfop == PETSCSF_BCAST) {
161:       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
162:       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
163:     } else if (sfop == PETSCSF_REDUCE) {
164:       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
165:       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
166:     } else {                                                                                /* PETSCSF_FETCH */
 167:       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
 168:       leafdirect[i] = PETSC_FALSE;                                                          /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
169:     }
170:   }
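  /* Example: for a PETSCSF_BCAST with op = MPI_SUM, leaves must be unpacked with addition, so leafdirect[]
     stays PETSC_FALSE even when the leaves are contiguous; only op = MPI_REPLACE lets MPI receive straight into leafdata. */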

172:   if (sf->use_gpu_aware_mpi) {
173:     rootmtype_mpi = rootmtype;
174:     leafmtype_mpi = leafmtype;
175:   } else {
176:     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
177:   }
178:   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
179:   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
180:   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
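  /* Example: leafdata on the device without GPU-aware MPI gives leafmtype_mpi = HOST != leafmtype = DEVICE, hence
     leafdirect_mpi = 0: MPI then sees the host staging buffer allocated further below, never leafdata itself. */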

182:   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
183:   nrootreqs = bas->nrootreqs;
184:   nleafreqs = sf->nleafreqs;

186:   /* Look for free links in cache */
187:   for (p = &bas->avail; (link = *p); p = &link->next) {
188:     if (!link->use_nvshmem) { /* Only check with MPI links */
189:       PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
190:       if (match) {
 191:         /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
 192:            If not, free the old requests. New requests are init'ed lazily, i.e., not until one calls PetscSFLinkGetMPIBuffersAndRequests().
 193:         */
194:         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
195:           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
196:           for (i = 0; i < nrootreqs; i++) {
197:             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
198:           }
199:           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
200:         }
201:         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
202:           reqs = link->leafreqs[direction][leafmtype][1];
203:           for (i = 0; i < nleafreqs; i++) {
204:             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
205:           }
206:           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
207:         }
208:         *p = link->next; /* Remove from available list */
209:         goto found;
210:       }
211:     }
212:   }

214:   PetscCall(PetscNew(&link));
215:   PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
216:   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

218:   nreqs = (nrootreqs + nleafreqs) * 8;
219:   PetscCall(PetscMalloc1(nreqs, &link->reqs));
 220:   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialize to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */
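  /* link->reqs layout: 8 blocks of nrootreqs root requests followed by 8 blocks of nleafreqs leaf requests.
     A block is selected by (direction i, memtype j, direct flag k) through the index 4*i + 2*j + k below.
     E.g. with nrootreqs = 3, rootreqs[1][0][1] points at link->reqs + 3 * (4*1 + 2*0 + 1) = link->reqs + 15. */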

222:   if (nreqs)
223:     for (i = 0; i < 2; i++) {     /* Two communication directions */
224:       for (j = 0; j < 2; j++) {   /* Two memory types */
225:         for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
226:           link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
227:           link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
228:         }
229:       }
230:     }
231:   link->StartCommunication  = PetscSFLinkStartRequests_MPI;
232:   link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
233: #if defined(PETSC_HAVE_MPIX_STREAM)
234:   if (sf->use_stream_aware_mpi && (PetscMemTypeDevice(rootmtype_mpi) || PetscMemTypeDevice(leafmtype_mpi))) {
235:     link->StartCommunication  = PetscSFLinkStartEnqueue_MPIX_Stream;
236:     link->FinishCommunication = PetscSFLinkWaitEnqueue_MPIX_Stream;
237:   }
238: #endif

240: found:

242: #if defined(PETSC_HAVE_DEVICE)
243:   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
244:   #if defined(PETSC_HAVE_CUDA)
245:     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
246:   #endif
247:   #if defined(PETSC_HAVE_HIP)
248:     if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
249:   #endif
250:   #if defined(PETSC_HAVE_KOKKOS)
251:     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
252:   #endif
253:   }
254: #endif

 256:   /* Allocate buffers for root/leafdata */
257:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
258:     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
259:     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
260:     if (bas->rootbuflen[i]) {
261:       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
262:         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
263:       } else { /* Have to have a separate rootbuf */
264:         if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
265:         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
266:       }
267:     }

269:     if (sf->leafbuflen[i]) {
270:       if (leafdirect[i]) {
271:         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
272:       } else {
273:         if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
274:         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
275:       }
276:     }
277:   }

279: #if defined(PETSC_HAVE_DEVICE)
 280:   /* Allocate buffers on the host to stage device data in case GPU-aware MPI is not used */
281:   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
282:     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
283:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
284:   }
285:   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
286:     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
287:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
288:   }
289: #endif

 291:   /* Set the `current` state of the link. These fields may change between different SF invocations that reuse the same link */
 292:   if (sf->persistent) { /* If the data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
293:     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
294:     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
295:   }

297:   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
298:   link->leafdata = leafdata;
299:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
300:     link->rootdirect[i] = rootdirect[i];
301:     link->leafdirect[i] = leafdirect[i];
302:   }
303:   link->rootdirect_mpi = rootdirect_mpi;
304:   link->leafdirect_mpi = leafdirect_mpi;
305:   link->rootmtype      = rootmtype;
306:   link->leafmtype      = leafmtype;
307:   link->rootmtype_mpi  = rootmtype_mpi;
308:   link->leafmtype_mpi  = leafmtype_mpi;

310:   link->next = bas->inuse;
311:   bas->inuse = link;
312:   *mylink    = link;
313:   PetscFunctionReturn(PETSC_SUCCESS);
314: }
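As a rough picture of how the pieces in this file fit together, the sketch below shows how an SF broadcast might drive a link created by PetscSFLinkCreate_MPI(). It is illustrative only: SketchBcastBegin()/SketchBcastEnd() are hypothetical names, the pack/unpack steps are only indicated by comments (the real helpers live in the packing layer declared in sfpack.h), and the actual SFBasic code differs in details such as how the link is looked up again in the End phase.

static PetscErrorCode SketchBcastBegin(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, PetscSFLink *outlink)
{
  PetscSFLink link;

  PetscFunctionBegin;
  /* Get a cached link or create one; buffers, host staging space and persistent requests are handled inside */
  PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
  /* ... pack rootdata into link->rootbuf[PETSCSF_REMOTE][rootmtype] here if rootdirect is false ... */
  PetscCall((*link->StartCommunication)(sf, link, PETSCSF_ROOT2LEAF)); /* PetscSFLinkStartRequests_MPI or the MPIX_Stream variant */
  *outlink = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode SketchBcastEnd(PetscSF sf, PetscSFLink link, void *leafdata)
{
  PetscFunctionBegin;
  PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)); /* MPI_Waitall + host->device copy if staging was used */
  /* ... unpack link->leafbuf into leafdata here if leafdirect is false, then return the link to bas->avail ... */
  PetscFunctionReturn(PETSC_SUCCESS);
}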