Actual source code: sfmpi.c

petsc-3.15.0 2021-03-30
/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/
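/* Illustrative sketch (not part of this file): the persistent-request pattern that SFBASIC builds on,
   contrasted with plain MPI_Isend. The names buf, count, dest, tag, comm and niter are hypothetical.

     MPI_Request req;
     MPI_Send_init(buf,count,MPI_DOUBLE,dest,tag,comm,&req);  // bind the (fixed) buffer to the request once
     for (int it=0; it<niter; it++) {
       MPI_Start(&req);                                       // reuse the same request every iteration
       MPI_Wait(&req,MPI_STATUS_IGNORE);
     }
     MPI_Request_free(&req);

   With MPI_Isend, a fresh request is created and completed on every call instead.
*/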

#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode    ierr;
  PetscMPIInt       nreqs;
  MPI_Request       *reqs = NULL;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          buflen;

  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);
    }
    MPI_Startall_irecv(buflen,link->unit,nreqs,reqs);
  }

  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs  = bas->nrootreqs;
      PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/* device2host before sending */);
      PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);
    } else { /* leaf to root */
      nreqs  = sf->nleafreqs;
      PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
      PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);
    }
    PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction);
    MPI_Startall_isend(buflen,link->unit,nreqs,reqs);
  }
  return(0);
}
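/* Illustrative sketch (not part of this file): what "copy data from device buf to host buf" amounts to
   when the MPI implementation is not GPU-aware. The buffer names, byte count and stream below are
   hypothetical; in the real code the staging is done by PetscSFLinkCopyRoot/LeafBufferInCaseNotUseGpuAwareMPI().

     cudaMemcpyAsync(hostbuf,devbuf,nbytes,cudaMemcpyDeviceToHost,stream);  // stage device data on the host
     cudaStreamSynchronize(stream);                                         // make sure the copy has finished
     MPI_Startall(nreqs,reqs);                                              // then hand the host buffer to MPI
*/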

static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode       ierr;
  PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
  const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);
  MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after receiving */);
  } else {
    PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);
  }
  return(0);
}
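/* Illustrative sketch (not part of this file): how the two callbacks above are meant to be paired by a
   caller that holds a link. The variable names and the Begin/End split are assumed, not taken from this file.

     link->StartCommunication(sf,link,PETSCSF_ROOT2LEAF);   // post receives, stage/pack, post sends (Begin side)
     // ... overlap local work here ...
     link->FinishCommunication(sf,link,PETSCSF_ROOT2LEAF);  // wait on all requests, then unpack (End side)
*/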

/*
   This routine creates a communication link for the given operation. It first looks up the link cache.
   If there is a free and suitable link, it is reused; otherwise a new one is created.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when data is on the GPU but GPU-aware MPI is not used.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode    ierr;
  PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
  PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink       *p,link;
  PetscSFDirection  direction;
  MPI_Request       *reqs = NULL;
  PetscBool         match,rootdirect[2],leafdirect[2];
  PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as a subscript */
  PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */


  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
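  /* Illustrative example (not from the source): for a PETSCSF_BCAST with op == MPI_REPLACE, if the remote
     leaves are contiguous, MPI can receive straight into

       (char*)leafdata + sf->leafstart[PETSCSF_REMOTE]*link->unitbytes

     and no unpack kernel is needed. With a non-replacing op such as MPI_SUM, the incoming values must land
     in a separate leaf buffer first so they can be combined with the existing leafdata during unpack. */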

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;

  direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      MPIPetsc_Type_compare(unit,link->unit,&match);
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }
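  /* Illustrative sketch (not from the source): a persistent request fixes its buffer address at init time,
     which is why the cache lookup above frees requests whose recorded root/leafdata no longer matches.
     The names below (data1, data2, count, dest, tag, comm) are hypothetical.

       MPI_Send_init(data1,count,unit,dest,tag,comm,&req);  // req is permanently tied to data1
       MPI_Start(&req);                                      // always sends from data1 ...
       MPI_Wait(&req,MPI_STATUS_IGNORE);
       MPI_Request_free(&req);                               // ... so switching to data2 requires freeing
       MPI_Send_init(data2,count,unit,dest,tag,comm,&req);   //     and re-initializing the request
  */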

  PetscNew(&link);
  PetscSFLinkSetUp_Host(sf,link,unit);
  PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  PetscMalloc1(nreqs,&link->reqs);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialize to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }
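  /* Layout of link->reqs implied by the indexing above (2 directions x 2 memory types x 2 direct flags):

       [ rootreqs[0][0][0] | rootreqs[0][0][1] | ... | rootreqs[1][1][1] |   8 blocks of nrootreqs each
         leafreqs[0][0][0] | leafreqs[0][0][1] | ... | leafreqs[1][1][1] ]   8 blocks of nleafreqs each

     For example, the leaf requests for (i=1,j=1,k=0) start at offset nrootreqs*8 + 6*nleafreqs. */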
  link->StartCommunication    = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication   = PetscSFLinkWaitRequests_MPI;

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
    #if defined(PETSC_HAVE_CUDA)
      if (sf->backend == PETSCSF_BACKEND_CUDA)   {PetscSFLinkSetUp_CUDA(sf,link,unit);} /* Setup streams etc */
    #endif
    #if defined(PETSC_HAVE_HIP)
      if (sf->backend == PETSCSF_BACKEND_HIP)    {PetscSFLinkSetUp_HIP(sf,link,unit);} /* Setup streams etc */
    #endif
    #if defined(PETSC_HAVE_KOKKOS)
      if (sf->backend == PETSCSF_BACKEND_KOKKOS) {PetscSFLinkSetUp_Kokkos(sf,link,unit);}
    #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data in case gpu-aware MPI is not used */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link; it may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi  = rootdirect_mpi;
  link->leafdirect_mpi  = leafdirect_mpi;
  link->rootmtype       = rootmtype;
  link->leafmtype       = leafmtype;
  link->rootmtype_mpi   = rootmtype_mpi;
  link->leafmtype_mpi   = leafmtype_mpi;

  link->next            = bas->inuse;
  bas->inuse            = link;
  *mylink               = link;
  return(0);
}
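
/* Illustrative sketch (not part of this file): the life cycle a caller is expected to give a link created
   above, e.g. on SFBasic's broadcast path. The pack/unpack and link-reclaim steps are described generically
   here; the real helpers live in sfpack.h.

     PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);
     // pack rootdata into link->rootbuf (skipped when rootdirect says MPI can read rootdata directly)
     link->StartCommunication(sf,link,PETSCSF_ROOT2LEAF);
     // ... later, in the matching End routine, the link is looked up by rootdata/leafdata ...
     link->FinishCommunication(sf,link,PETSCSF_ROOT2LEAF);
     // unpack/combine link->leafbuf into leafdata, then return the link to bas->avail for reuse
*/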