Actual source code: sfnvshmem.cu

petsc-3.15.0 2021-03-30
  1: #include <petsc/private/cudavecimpl.h>
  2: #include <petsccublas.h>
  3: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  4: #include <mpi.h>
  5: #include <nvshmem.h>
  6: #include <nvshmemx.h>

  8: PetscErrorCode PetscNvshmemInitializeCheck(void)
  9: {
 10:   PetscErrorCode   ierr;

 13:   if (!PetscNvshmemInitialized) { /* Note NVSHMEM does not provide a routine to check whether it is initialized */
 14:     nvshmemx_init_attr_t attr;
 15:     attr.mpi_comm = &PETSC_COMM_WORLD;
 16:     PetscCUDAInitializeCheck();
 17:     nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM,&attr);
 18:     PetscNvshmemInitialized = PETSC_TRUE;
 19:     PetscBeganNvshmem       = PETSC_TRUE;
 20:   }
 21:   return(0);
 22: }
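For readers unfamiliar with the MPI bootstrap used above, here is a minimal stand-alone sketch (not part of sfnvshmem.cu; my_comm is an illustrative name) of initializing and finalizing NVSHMEM on top of MPI:

#include <mpi.h>
#include <nvshmem.h>
#include <nvshmemx.h>

int main(int argc,char **argv)
{
  MPI_Comm             my_comm = MPI_COMM_WORLD;
  nvshmemx_init_attr_t attr;

  MPI_Init(&argc,&argv);
  attr.mpi_comm = &my_comm;                               /* NVSHMEM keeps a pointer to the communicator */
  nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM,&attr);  /* bootstrap NVSHMEM on top of MPI */
  /* ... symmetric allocations and communication ... */
  nvshmem_finalize();
  MPI_Finalize();
  return 0;
}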

 24: PetscErrorCode PetscNvshmemMalloc(size_t size, void** ptr)
 25: {

 29:   PetscNvshmemInitializeCheck();
 30:   *ptr = nvshmem_malloc(size);
 31:   if (!*ptr) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"nvshmem_malloc() failed to allocate %zu bytes",size);
 32:   return(0);
 33: }

 35: PetscErrorCode PetscNvshmemCalloc(size_t size, void**ptr)
 36: {

 40:   PetscNvshmemInitializeCheck();
 41:   *ptr = nvshmem_calloc(size,1);
 42:   if (!*ptr) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"nvshmem_calloc() failed to allocate %zu bytes",size);
 43:   return(0);
 44: }

 46: PetscErrorCode PetscNvshmemFree_Private(void* ptr)
 47: {
 49:   nvshmem_free(ptr);
 50:   return(0);
 51: }
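A hedged usage sketch of the allocation wrappers above (illustrative fragment, error checking omitted): symmetric allocations are collective, so every PE must request the same size.

  void *buf;
  PetscNvshmemMalloc(128*sizeof(double),&buf);  /* from the NVSHMEM symmetric heap; same size on every PE */
  /* ... use buf as a remotely accessible communication buffer ... */
  PetscNvshmemFree(buf);                        /* release the symmetric buffer */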

 53: PetscErrorCode PetscNvshmemFinalize(void)
 54: {
 56:   nvshmem_finalize();
 57:   return(0);
 58: }

 60: /* Free nvshmem related fields in the SF */
 61: PetscErrorCode PetscSFReset_Basic_NVSHMEM(PetscSF sf)
 62: {
 63:   PetscErrorCode    ierr;
 64:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;

 67:   PetscFree2(bas->leafsigdisp,bas->leafbufdisp);
 68:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->leafbufdisp_d);
 69:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->leafsigdisp_d);
 70:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->iranks_d);
 71:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->ioffset_d);

 73:   PetscFree2(sf->rootsigdisp,sf->rootbufdisp);
 74:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->rootbufdisp_d);
 75:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->rootsigdisp_d);
 76:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->ranks_d);
 77:   PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->roffset_d);
 78:   return(0);
 79: }

 81: /* Set up NVSHMEM-related fields for an SF of type SFBASIC (to be called only after PetscSFSetUp_Basic() has set up the fields this depends on) */
 82: static PetscErrorCode PetscSFSetUp_Basic_NVSHMEM(PetscSF sf)
 83: {
 85:   cudaError_t    cerr;
 86:   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
 87:   PetscInt       i,nRemoteRootRanks,nRemoteLeafRanks;
 88:   PetscMPIInt    tag;
 89:   MPI_Comm       comm;
 90:   MPI_Request    *rootreqs,*leafreqs;
 91:   PetscInt       tmp,stmp[4],rtmp[4]; /* tmps for send/recv buffers */

 94:   PetscObjectGetComm((PetscObject)sf,&comm);
 95:   PetscObjectGetNewTag((PetscObject)sf,&tag);

 97:   nRemoteRootRanks      = sf->nranks-sf->ndranks;
 98:   nRemoteLeafRanks      = bas->niranks-bas->ndiranks;
 99:   sf->nRemoteRootRanks  = nRemoteRootRanks;
100:   bas->nRemoteLeafRanks = nRemoteLeafRanks;

102:   PetscMalloc2(nRemoteLeafRanks,&rootreqs,nRemoteRootRanks,&leafreqs);

104:   stmp[0] = nRemoteRootRanks;
105:   stmp[1] = sf->leafbuflen[PETSCSF_REMOTE];
106:   stmp[2] = nRemoteLeafRanks;
107:   stmp[3] = bas->rootbuflen[PETSCSF_REMOTE];

109:   MPIU_Allreduce(stmp,rtmp,4,MPIU_INT,MPI_MAX,comm);

111:   sf->nRemoteRootRanksMax   = rtmp[0];
112:   sf->leafbuflen_rmax       = rtmp[1];
113:   bas->nRemoteLeafRanksMax  = rtmp[2];
114:   bas->rootbuflen_rmax      = rtmp[3];

116:   /* In total, four rounds of MPI communication are needed to set up the NVSHMEM fields */

118:   /* Root ranks to leaf ranks: send info about rootsigdisp[] and rootbufdisp[] */
119:   PetscMalloc2(nRemoteRootRanks,&sf->rootsigdisp,nRemoteRootRanks,&sf->rootbufdisp);
120:   for (i=0; i<nRemoteRootRanks; i++) {MPI_Irecv(&sf->rootsigdisp[i],1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm,&leafreqs[i]);} /* Leaves recv */
121:   for (i=0; i<nRemoteLeafRanks; i++) {MPI_Send(&i,1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm);} /* Roots send. The send buffer i changes each iteration, so we use blocking MPI_Send. */
122:   MPI_Waitall(nRemoteRootRanks,leafreqs,MPI_STATUSES_IGNORE);

124:   for (i=0; i<nRemoteRootRanks; i++) {MPI_Irecv(&sf->rootbufdisp[i],1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm,&leafreqs[i]);} /* Leaves recv */
125:   for (i=0; i<nRemoteLeafRanks; i++) {
126:     tmp  = bas->ioffset[i+bas->ndiranks] - bas->ioffset[bas->ndiranks];
127:     MPI_Send(&tmp,1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm);  /* Roots send. The send buffer tmp changes each iteration, so we use blocking MPI_Send. */
128:   }
129:   MPI_Waitall(nRemoteRootRanks,leafreqs,MPI_STATUSES_IGNORE);

131:   cerr = cudaMalloc((void**)&sf->rootbufdisp_d,nRemoteRootRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
132:   cerr = cudaMalloc((void**)&sf->rootsigdisp_d,nRemoteRootRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
133:   cerr = cudaMalloc((void**)&sf->ranks_d,nRemoteRootRanks*sizeof(PetscMPIInt));CHKERRCUDA(cerr);
134:   cerr = cudaMalloc((void**)&sf->roffset_d,(nRemoteRootRanks+1)*sizeof(PetscInt));CHKERRCUDA(cerr);

136:   cerr = cudaMemcpyAsync(sf->rootbufdisp_d,sf->rootbufdisp,nRemoteRootRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
137:   cerr = cudaMemcpyAsync(sf->rootsigdisp_d,sf->rootsigdisp,nRemoteRootRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
138:   cerr = cudaMemcpyAsync(sf->ranks_d,sf->ranks+sf->ndranks,nRemoteRootRanks*sizeof(PetscMPIInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
139:   cerr = cudaMemcpyAsync(sf->roffset_d,sf->roffset+sf->ndranks,(nRemoteRootRanks+1)*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);

141:   /* Leaf ranks to root ranks: send info about leafsigdisp[] and leafbufdisp[] */
142:   PetscMalloc2(nRemoteLeafRanks,&bas->leafsigdisp,nRemoteLeafRanks,&bas->leafbufdisp);
143:   for (i=0; i<nRemoteLeafRanks; i++) {MPI_Irecv(&bas->leafsigdisp[i],1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm,&rootreqs[i]);}
144:   for (i=0; i<nRemoteRootRanks; i++) {MPI_Send(&i,1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm);}
145:   MPI_Waitall(nRemoteLeafRanks,rootreqs,MPI_STATUSES_IGNORE);

147:   for (i=0; i<nRemoteLeafRanks; i++) {MPI_Irecv(&bas->leafbufdisp[i],1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm,&rootreqs[i]);}
148:   for (i=0; i<nRemoteRootRanks; i++) {
149:     tmp  = sf->roffset[i+sf->ndranks] - sf->roffset[sf->ndranks];
150:     MPI_Send(&tmp,1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm);
151:   }
152:   MPI_Waitall(nRemoteLeafRanks,rootreqs,MPI_STATUSES_IGNORE);

154:   cerr = cudaMalloc((void**)&bas->leafbufdisp_d,nRemoteLeafRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
155:   cerr = cudaMalloc((void**)&bas->leafsigdisp_d,nRemoteLeafRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
156:   cerr = cudaMalloc((void**)&bas->iranks_d,nRemoteLeafRanks*sizeof(PetscMPIInt));CHKERRCUDA(cerr);
157:   cerr = cudaMalloc((void**)&bas->ioffset_d,(nRemoteLeafRanks+1)*sizeof(PetscInt));CHKERRCUDA(cerr);

159:   cerr = cudaMemcpyAsync(bas->leafbufdisp_d,bas->leafbufdisp,nRemoteLeafRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
160:   cerr = cudaMemcpyAsync(bas->leafsigdisp_d,bas->leafsigdisp,nRemoteLeafRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
161:   cerr = cudaMemcpyAsync(bas->iranks_d,bas->iranks+bas->ndiranks,nRemoteLeafRanks*sizeof(PetscMPIInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
162:   cerr = cudaMemcpyAsync(bas->ioffset_d,bas->ioffset+bas->ndiranks,(nRemoteLeafRanks+1)*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);

164:   PetscFree2(rootreqs,leafreqs);
165:   return(0);
166: }
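All four rounds above follow the same neighbor-exchange pattern. A condensed sketch of one round (illustrative helper, not PETSc API): each rank tells its outgoing neighbors the displacement at which they may access data/signals here, and learns its own displacement on each incoming neighbor.

  static PetscErrorCode ExchangeDisplacements(MPI_Comm comm,PetscMPIInt tag,PetscInt nin,const PetscMPIInt *inranks,PetscInt *mydisp,PetscInt nout,const PetscMPIInt *outranks,const PetscInt *outdisp,MPI_Request *reqs)
  {
    PetscInt i,tmp;

    for (i=0; i<nin; i++) MPI_Irecv(&mydisp[i],1,MPIU_INT,inranks[i],tag,comm,&reqs[i]); /* learn my displacement on inranks[i] */
    for (i=0; i<nout; i++) {
      tmp = outdisp[i];
      MPI_Send(&tmp,1,MPIU_INT,outranks[i],tag,comm); /* blocking send: the buffer changes each iteration */
    }
    MPI_Waitall(nin,reqs,MPI_STATUSES_IGNORE);
    return(0);
  }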

168: PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF sf,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscBool *use_nvshmem)
169: {
170:   PetscErrorCode   ierr;
171:   MPI_Comm         comm;
172:   PetscBool        isBasic;
173:   PetscMPIInt      result = MPI_UNEQUAL;

176:   PetscObjectGetComm((PetscObject)sf,&comm);
177:   /* Check if the sf is eligible for NVSHMEM, if we have not checked yet.
178:      Note the check result <use_nvshmem> must be the same over comm, since an SFLink must be collectively either NVSHMEM or MPI.
179:   */
181:   if (sf->use_nvshmem && !sf->checked_nvshmem_eligibility) {
182:     /* Only use NVSHMEM for SFBASIC on PETSC_COMM_WORLD  */
183:     PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isBasic);
184:     if (isBasic) {MPI_Comm_compare(PETSC_COMM_WORLD,comm,&result);}
185:     if (!isBasic || (result != MPI_IDENT && result != MPI_CONGRUENT)) sf->use_nvshmem = PETSC_FALSE; /* If not eligible, clear the flag so that we don't try again */

187:     /* Do a further check: if both rootdata and leafdata are NULL on some rank, we might assume they are PETSC_MEMTYPE_CUDA (or HOST)
188:        and then use NVSHMEM. But if the root/leaf mtypes on other ranks are PETSC_MEMTYPE_HOST (or DEVICE), that would make the
189:        return value <use_nvshmem> inconsistent across ranks. To be safe, we simply disable NVSHMEM on these rare SFs.
190:     */
191:     if (sf->use_nvshmem) {
192:       PetscInt hasNullRank = (!rootdata && !leafdata) ? 1 : 0;
193:       MPI_Allreduce(MPI_IN_PLACE,&hasNullRank,1,MPIU_INT,MPI_LOR,comm);
194:       if (hasNullRank) sf->use_nvshmem = PETSC_FALSE;
195:     }
196:     sf->checked_nvshmem_eligibility = PETSC_TRUE; /* Eligible or not, don't do the above check again */
197:   }

199:   /* Check if rootmtype and leafmtype collectively are PETSC_MEMTYPE_CUDA */
200:   if (sf->use_nvshmem) {
201:     PetscInt oneCuda = (!rootdata || PetscMemTypeCUDA(rootmtype)) && (!leafdata || PetscMemTypeCUDA(leafmtype)) ? 1 : 0; /* Do I use cuda for both root&leafmtype? */
202:     PetscInt allCuda = oneCuda; /* Assume the same for all ranks. But if not, in opt mode, return value <use_nvshmem> won't be collective! */
203:    #if defined(PETSC_USE_DEBUG)  /* Check in debug mode. Note MPI_Allreduce is expensive, so only in debug mode */
204:     MPI_Allreduce(&oneCuda,&allCuda,1,MPIU_INT,MPI_LAND,comm);
205:     if (allCuda != oneCuda) SETERRQ(comm,PETSC_ERR_SUP,"root/leaf mtypes are inconsistent among ranks, which may lead to SF nvshmem failure in opt mode. Add -use_nvshmem 0 to disable it.");
206:    #endif
207:     if (allCuda) {
208:       PetscNvshmemInitializeCheck();
209:       if (!sf->setup_nvshmem) { /* Set up nvshmem related fields on this SF on-demand */
210:         PetscSFSetUp_Basic_NVSHMEM(sf);
211:         sf->setup_nvshmem = PETSC_TRUE;
212:       }
213:       *use_nvshmem = PETSC_TRUE;
214:     } else {
215:       *use_nvshmem = PETSC_FALSE;
216:     }
217:   } else {
218:     *use_nvshmem = PETSC_FALSE;
219:   }
220:   return(0);
221: }

223: /* Build dependence between <stream> and <remoteCommStream> at the entry of NVSHMEM communication */
224: static PetscErrorCode PetscSFLinkBuildDependenceBegin(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
225: {
226:   cudaError_t    cerr;
227:   PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
228:   PetscInt       buflen = (direction == PETSCSF_ROOT2LEAF)? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];

231:   if (buflen) {
232:     cerr = cudaEventRecord(link->dataReady,link->stream);CHKERRCUDA(cerr);
233:     cerr = cudaStreamWaitEvent(link->remoteCommStream,link->dataReady,0);CHKERRCUDA(cerr);
234:   }
235:   return(0);
236: }

238: /* Build dependence between <stream> and <remoteCommStream> at the exit of NVSHMEM communication */
239: static PetscErrorCode PetscSFLinkBuildDependenceEnd(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
240: {
241:   cudaError_t    cerr;
242:   PetscSF_Basic  *bas = (PetscSF_Basic *)sf->data;
243:   PetscInt       buflen = (direction == PETSCSF_ROOT2LEAF)? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];

246:   /* If we unpack to a non-empty device buffer, build the endRemoteComm dependence */
247:   if (buflen) {
248:     cerr = cudaEventRecord(link->endRemoteComm,link->remoteCommStream);CHKERRCUDA(cerr);
249:     cerr = cudaStreamWaitEvent(link->stream,link->endRemoteComm,0);CHKERRCUDA(cerr);
250:   }
251:   return(0);
252: }
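A sketch of the intended usage pattern (the Begin/End pairs in the routines below follow it):

  PetscSFLinkBuildDependenceBegin(sf,link,direction);  /* remoteCommStream waits until packing on link->stream is done */
  /* ... launch signal/get/put kernels and nvshmemx_*_on_stream() calls on link->remoteCommStream ... */
  PetscSFLinkBuildDependenceEnd(sf,link,direction);    /* link->stream waits until the NVSHMEM traffic on remoteCommStream has completed */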

254: /* Send/Put signals to remote ranks

256:  Input parameters:
257:   + n        - Number of remote ranks
258:   . sig      - Signal address in symmetric heap
259:   . sigdisp  - To i-th rank, use its signal at offset sigdisp[i]
260:   . ranks    - remote ranks
261:   - newval   - Set signals to this value
262: */
263: __global__ static void NvshmemSendSignals(PetscInt n,uint64_t *sig,PetscInt *sigdisp,PetscMPIInt *ranks,uint64_t newval)
264: {
265:   int i = blockIdx.x*blockDim.x + threadIdx.x;

267:   /* Each thread puts one remote signal */
268:   if (i < n) nvshmemx_uint64_signal(sig+sigdisp[i],newval,ranks[i]);
269: }

271: /* Wait until the local signals equal the expected value, then set them to a new value

273:  Input parameters:
274:   + n        - Number of signals
275:   . sig      - Local signal address
276:   . expval   - expected value
277:   - newval   - Set signals to this new value
278: */
279: __global__ static void NvshmemWaitSignals(PetscInt n,uint64_t *sig,uint64_t expval,uint64_t newval)
280: {
281: #if 0
282:   /* Akhil Langer@NVIDIA said using 1 thread and nvshmem_uint64_wait_until_all is better */
283:   int i = blockIdx.x*blockDim.x + threadIdx.x;
284:   if (i < n) {
285:     nvshmem_signal_wait_until(sig+i,NVSHMEM_CMP_EQ,expval);
286:     sig[i] = newval;
287:   }
288: #else
289:   nvshmem_uint64_wait_until_all(sig,n,NULL/*no mask*/,NVSHMEM_CMP_EQ,expval);
290:   for (int i=0; i<n; i++) sig[i] = newval;
291: #endif
292: }

294: /* ===========================================================================================================

296:    A set of routines to support receiver-initiated communication using the get method

298:     The getting protocol is:

300:     Sender has a send buf (sbuf) and a signal variable (ssig);  Receiver has a recv buf (rbuf) and a signal variable (rsig);
301:     All signal variables have an initial value of 0.

303:     Sender:                                      |  Receiver:
304:   1.  Wait until ssig is 0, then set it to 1     |
305:   2.  Pack data into the stand-alone sbuf        |
306:   3.  Put 1 to the receiver's rsig               |   1. Wait until rsig is 1, then set it to 0
307:                                                  |   2. Get data from the remote sbuf into the local rbuf
308:                                                  |   3. Put 0 to the sender's ssig
309:                                                  |   4. Unpack data from the local rbuf
310:    ===========================================================================================================*/
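An editorial sketch (not part of this file) of the getting protocol for a single sender/receiver pair, written as two one-thread device kernels; sbuf, rbuf, ssig, rsig and the peer arguments are illustrative, and the blocking nvshmem_getmem stands in for the non-blocking get plus completion used by the real code below:

  __global__ static void GetProtocolSender(char *sbuf,uint64_t *ssig,uint64_t *rsig,int receiver)
  {
    nvshmem_uint64_wait_until(ssig,NVSHMEM_CMP_EQ,0);  /* 1. wait until the receiver is done with sbuf */
    *ssig = 1;
    /* 2. pack data into the stand-alone sbuf ... */
    nvshmemx_uint64_signal(rsig,1,receiver);           /* 3. tell the receiver it may start getting */
  }

  __global__ static void GetProtocolReceiver(char *rbuf,const char *sbuf,size_t nbytes,uint64_t *rsig,uint64_t *ssig,int sender)
  {
    nvshmem_uint64_wait_until(rsig,NVSHMEM_CMP_EQ,1);  /* 1. wait for the sender's go-ahead */
    *rsig = 0;
    nvshmem_getmem(rbuf,sbuf,nbytes,sender);           /* 2. pull the data from the sender's sbuf */
    nvshmemx_uint64_signal(ssig,0,sender);             /* 3. let the sender reuse sbuf */
    /* 4. unpack data from the local rbuf ... */
  }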
311: /* PrePack operation -- since the sender will overwrite the send buffer, which receivers might still be getting data from,
312:    the sender waits for signals (from receivers) indicating that the receivers have finished getting the data
313: */
314: PetscErrorCode PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
315: {
316:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
317:   uint64_t          *sig;
318:   PetscInt          n;

321:   if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */
322:     sig = link->rootSendSig;            /* leaf ranks set my rootSendSig */
323:     n   = bas->nRemoteLeafRanks;
324:   } else { /* LEAF2ROOT */
325:     sig = link->leafSendSig;
326:     n   = sf->nRemoteRootRanks;
327:   }

329:   if (n) {
330:     NvshmemWaitSignals<<<1,1,0,link->remoteCommStream>>>(n,sig,0,1); /* wait for the signals to become 0, then set them to 1 */
331:     cudaError_t cerr = cudaGetLastError();CHKERRCUDA(cerr);
332:   }
333:   return(0);
334: }

336: /* n thread blocks; each block handles one remote rank */
337: __global__ static void GetDataFromRemotelyAccessible(PetscInt nsrcranks,PetscMPIInt *srcranks,const char *src,PetscInt *srcdisp,char *dst,PetscInt *dstdisp,PetscInt unitbytes)
338: {
339:   int               bid = blockIdx.x;
340:   PetscMPIInt       pe  = srcranks[bid];

342:   if (!nvshmem_ptr(src,pe)) {
343:     PetscInt nelems = (dstdisp[bid+1]-dstdisp[bid])*unitbytes;
344:     nvshmem_getmem_nbi(dst+(dstdisp[bid]-dstdisp[0])*unitbytes,src+srcdisp[bid]*unitbytes,nelems,pe);
345:   }
346: }

348: /* Start communication -- Get data in the given direction */
349: PetscErrorCode PetscSFLinkGetDataBegin_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
350: {
351:   PetscErrorCode    ierr;
352:   cudaError_t       cerr;
353:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;

355:   PetscInt          nsrcranks,ndstranks,nLocallyAccessible = 0;

357:   char              *src,*dst;
358:   PetscInt          *srcdisp_h,*dstdisp_h;
359:   PetscInt          *srcdisp_d,*dstdisp_d;
360:   PetscMPIInt       *srcranks_h;
361:   PetscMPIInt       *srcranks_d,*dstranks_d;
362:   uint64_t          *dstsig;
363:   PetscInt          *dstsigdisp_d;

366:   PetscSFLinkBuildDependenceBegin(sf,link,direction);
367:   if (direction == PETSCSF_ROOT2LEAF) { /* src is root, dst is leaf; we will move data from src to dst */
368:     nsrcranks    = sf->nRemoteRootRanks;
369:     src          = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* root buf is the send buf; it is in symmetric heap */

371:     srcdisp_h    = sf->rootbufdisp;       /* for my i-th remote root rank, I will access its buf at offset rootbufdisp[i] */
372:     srcdisp_d    = sf->rootbufdisp_d;
373:     srcranks_h   = sf->ranks+sf->ndranks; /* my (remote) root ranks */
374:     srcranks_d   = sf->ranks_d;

376:     ndstranks    = bas->nRemoteLeafRanks;
377:     dst          = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* recv buf is the local leaf buf, also in symmetric heap */

379:     dstdisp_h    = sf->roffset+sf->ndranks; /* offsets of the local leaf buf. Note dstdisp[0] is not necessarily 0 */
380:     dstdisp_d    = sf->roffset_d;
381:     dstranks_d   = bas->iranks_d; /* my (remote) leaf ranks */

383:     dstsig       = link->leafRecvSig;
384:     dstsigdisp_d = bas->leafsigdisp_d;
385:   } else { /* src is leaf, dst is root; we will move data from src to dst */
386:     nsrcranks    = bas->nRemoteLeafRanks;
387:     src          = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* leaf buf is the send buf */

389:     srcdisp_h    = bas->leafbufdisp;       /* for my i-th remote leaf rank, I will access its buf at offset leafbufdisp[i] */
390:     srcdisp_d    = bas->leafbufdisp_d;
391:     srcranks_h   = bas->iranks+bas->ndiranks; /* my (remote) leaf ranks */
392:     srcranks_d   = bas->iranks_d;

394:     ndstranks    = sf->nRemoteRootRanks;
395:     dst          = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* the local root buf is the recv buf */

397:     dstdisp_h    = bas->ioffset+bas->ndiranks; /* offsets of the local root buf. Note dstdisp[0] is not necessarily 0 */
398:     dstdisp_d    = bas->ioffset_d;
399:     dstranks_d   = sf->ranks_d; /* my (remote) root ranks */

401:     dstsig       = link->rootRecvSig;
402:     dstsigdisp_d = sf->rootsigdisp_d;
403:   }

405:   /* After Pack operation -- src tells dst ranks that they are allowed to get data */
406:   if (ndstranks) {
407:     NvshmemSendSignals<<<(ndstranks+255)/256,256,0,link->remoteCommStream>>>(ndstranks,dstsig,dstsigdisp_d,dstranks_d,1); /* set signals to 1 */
408:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
409:   }

411:   /* dst waits for signals (permissions) from src ranks to start getting data */
412:   if (nsrcranks) {
413:     NvshmemWaitSignals<<<1,1,0,link->remoteCommStream>>>(nsrcranks,dstsig,1,0); /* wait for the signals to become 1, then set them to 0 */
414:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
415:   }

417:   /* dst gets data from src ranks using non-blocking nvshmem_gets, which are finished in PetscSFLinkGetDataEnd_NVSHMEM() */

419:   /* Count number of locally accessible src ranks, which should be a small number */
420:   for (int i=0; i<nsrcranks; i++) {if (nvshmem_ptr(src,srcranks_h[i])) nLocallyAccessible++;}

422:   /* Get data from remotely accessible PEs */
423:   if (nLocallyAccessible < nsrcranks) {
424:     GetDataFromRemotelyAccessible<<<nsrcranks,1,0,link->remoteCommStream>>>(nsrcranks,srcranks_d,src,srcdisp_d,dst,dstdisp_d,link->unitbytes);
425:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
426:   }

428:   /* Get data from locally accessible PEs */
429:   if (nLocallyAccessible) {
430:     for (int i=0; i<nsrcranks; i++) {
431:       int pe = srcranks_h[i];
432:       if (nvshmem_ptr(src,pe)) {
433:         size_t nelems = (dstdisp_h[i+1]-dstdisp_h[i])*link->unitbytes;
434:         nvshmemx_getmem_nbi_on_stream(dst+(dstdisp_h[i]-dstdisp_h[0])*link->unitbytes,src+srcdisp_h[i]*link->unitbytes,nelems,pe,link->remoteCommStream);
435:       }
436:     }
437:   }
438:   return(0);
439: }

441: /* Finish the communication (can be done before Unpack)
442:    The receiver tells its senders that they may reuse their send buffers (since the receiver has already gotten the data from them)
443: */
444: PetscErrorCode PetscSFLinkGetDataEnd_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
445: {
446:   PetscErrorCode    ierr;
447:   cudaError_t       cerr;
448:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
449:   uint64_t          *srcsig;
450:   PetscInt          nsrcranks,*srcsigdisp;
451:   PetscMPIInt       *srcranks;

454:   if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */
455:     nsrcranks   = sf->nRemoteRootRanks;
456:     srcsig      = link->rootSendSig;     /* I want to set their root signal */
457:     srcsigdisp  = sf->rootsigdisp_d;     /* offset of each root signal */
458:     srcranks    = sf->ranks_d;           /* ranks of the n root ranks */
459:   } else { /* LEAF2ROOT, root ranks are getting data */
460:     nsrcranks   = bas->nRemoteLeafRanks;
461:     srcsig      = link->leafSendSig;
462:     srcsigdisp  = bas->leafsigdisp_d;
463:     srcranks    = bas->iranks_d;
464:   }

466:   if (nsrcranks) {
467:     nvshmemx_quiet_on_stream(link->remoteCommStream); /* Finish the nonblocking get, so that we can unpack afterwards */
468:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
469:     NvshmemSendSignals<<<(nsrcranks+511)/512,512,0,link->remoteCommStream>>>(nsrcranks,srcsig,srcsigdisp,srcranks,0); /* set signals to 0 */
470:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
471:   }
472:   PetscSFLinkBuildDependenceEnd(sf,link,direction);
473:   return(0);
474: }

476: /* ===========================================================================================================

478:    A set of routines to support sender-initiated communication using the put-based method (the default)

480:     The putting protocol is:

482:     Sender has a send buf (sbuf) and a send signal var (ssig);  Receiver has a stand-alone recv buf (rbuf)
483:     and a recv signal var (rsig); All signal variables have an initial value of 0. rbuf is allocated by the SF and
484:     lives in NVSHMEM (symmetric) space.

486:     Sender:                                      |  Receiver:
487:                                                  |
488:   1.  Pack data into sbuf                        |
489:   2.  Wait until ssig is 0, then set it to 1     |
490:   3.  Put data to the remote stand-alone rbuf    |
491:   4.  Fence // make sure 5 happens after 3       |
492:   5.  Put 1 to the receiver's rsig               |   1. Wait until rsig is 1, then set it to 0
493:                                                  |   2. Unpack data from the local rbuf
494:                                                  |   3. Put 0 to the sender's ssig
495:    ===========================================================================================================*/
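An editorial sketch (not part of this file) of the putting protocol for one sender/receiver pair, again as two one-thread device kernels with illustrative names (rbuf, sbuf, ssig, rsig, peer arguments); the real code below batches many PEs per kernel and prefers the stream-based host API for locally accessible PEs:

  __global__ static void PutProtocolSender(char *rbuf,const char *sbuf,size_t nbytes,uint64_t *ssig,uint64_t *rsig,int receiver)
  {
    /* 1. pack data into sbuf ... */
    nvshmem_uint64_wait_until(ssig,NVSHMEM_CMP_EQ,0);  /* 2. wait until the receiver's rbuf is free */
    *ssig = 1;
    nvshmem_putmem_nbi(rbuf,sbuf,nbytes,receiver);     /* 3. put data into the remote stand-alone rbuf */
    nvshmem_fence();                                   /* 4. make sure step 5 is delivered after step 3 */
    nvshmemx_uint64_signal(rsig,1,receiver);           /* 5. tell the receiver the data has arrived */
  }

  __global__ static void PutProtocolReceiver(uint64_t *rsig,uint64_t *ssig,int sender)
  {
    nvshmem_uint64_wait_until(rsig,NVSHMEM_CMP_EQ,1);  /* 1. wait for the data-arrival signal */
    *rsig = 0;
    /* 2. unpack data from the local rbuf ... */
    nvshmemx_uint64_signal(ssig,0,sender);             /* 3. tell the sender that rbuf has been drained and may be overwritten again */
  }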

497: /* n thread blocks; each block handles one remote rank */
498: __global__ static void WaitAndPutDataToRemotelyAccessible(PetscInt ndstranks,PetscMPIInt *dstranks,char *dst,PetscInt *dstdisp,const char *src,PetscInt *srcdisp,uint64_t *srcsig,PetscInt unitbytes)
499: {
500:   int               bid = blockIdx.x;
501:   PetscMPIInt       pe  = dstranks[bid];

503:   if (!nvshmem_ptr(dst,pe)) {
504:     PetscInt nelems = (srcdisp[bid+1]-srcdisp[bid])*unitbytes;
505:     nvshmem_uint64_wait_until(srcsig+bid,NVSHMEM_CMP_EQ,0); /* Wait until the sig = 0 */
506:     srcsig[bid] = 1;
507:     nvshmem_putmem_nbi(dst+dstdisp[bid]*unitbytes,src+(srcdisp[bid]-srcdisp[0])*unitbytes,nelems,pe);
508:   }
509: }

511: /* A one-thread kernel that handles all locally accessible PEs */
512: __global__ static void WaitSignalsFromLocallyAccessible(PetscInt ndstranks,PetscMPIInt *dstranks,uint64_t *srcsig,const char *dst)
513: {
514:   for (int i=0; i<ndstranks; i++) {
515:     int pe = dstranks[i];
516:     if (nvshmem_ptr(dst,pe)) {
517:       nvshmem_uint64_wait_until(srcsig+i,NVSHMEM_CMP_EQ,0); /* Wait until the sig = 0 */
518:       srcsig[i] = 1;
519:     }
520:   }
521: }

523: /* Put data in the given direction  */
524: PetscErrorCode PetscSFLinkPutDataBegin_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
525: {
526:   PetscErrorCode    ierr;
527:   cudaError_t       cerr;
528:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
529:   PetscInt          ndstranks,nLocallyAccessible = 0;
530:   char              *src,*dst;
531:   PetscInt          *srcdisp_h,*dstdisp_h;
532:   PetscInt          *srcdisp_d,*dstdisp_d;
533:   PetscMPIInt       *dstranks_h;
534:   PetscMPIInt       *dstranks_d;
535:   uint64_t          *srcsig;

538:   PetscSFLinkBuildDependenceBegin(sf,link,direction);
539:   if (direction == PETSCSF_ROOT2LEAF) { /* put data in rootbuf to leafbuf  */
540:     ndstranks    = bas->nRemoteLeafRanks; /* number of (remote) leaf ranks */
541:     src          = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* Both src & dst must be symmetric */
542:     dst          = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];

544:     srcdisp_h    = bas->ioffset+bas->ndiranks;  /* offsets of rootbuf. srcdisp[0] is not necessarily zero */
545:     srcdisp_d    = bas->ioffset_d;
546:     srcsig       = link->rootSendSig;

548:     dstdisp_h    = bas->leafbufdisp;            /* for my i-th remote leaf rank, I will access its leaf buf at offset leafbufdisp[i] */
549:     dstdisp_d    = bas->leafbufdisp_d;
550:     dstranks_h   = bas->iranks+bas->ndiranks;   /* remote leaf ranks */
551:     dstranks_d   = bas->iranks_d;
552:   } else { /* put data in leafbuf to rootbuf */
553:     ndstranks    = sf->nRemoteRootRanks;
554:     src          = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
555:     dst          = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];

557:     srcdisp_h    = sf->roffset+sf->ndranks; /* offsets of leafbuf */
558:     srcdisp_d    = sf->roffset_d;
559:     srcsig       = link->leafSendSig;

561:     dstdisp_h    = sf->rootbufdisp;         /* for my i-th remote root rank, I will access its root buf at offset rootbufdisp[i] */
562:     dstdisp_d    = sf->rootbufdisp_d;
563:     dstranks_h   = sf->ranks+sf->ndranks;   /* remote root ranks */
564:     dstranks_d   = sf->ranks_d;
565:   }

567:   /* Wait for signals and then put data to dst ranks using non-blocking nvshmem_put, which is completed in PetscSFLinkPutDataEnd_NVSHMEM() */

569:   /* Count number of locally accessible neighbors, which should be a small number */
570:   for (int i=0; i<ndstranks; i++) {if (nvshmem_ptr(dst,dstranks_h[i])) nLocallyAccessible++;}

572:   /* For remotely accessible PEs, send data to them in one kernel call */
573:   if (nLocallyAccessible < ndstranks) {
574:     WaitAndPutDataToRemotelyAccessible<<<ndstranks,1,0,link->remoteCommStream>>>(ndstranks,dstranks_d,dst,dstdisp_d,src,srcdisp_d,srcsig,link->unitbytes);
575:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
576:   }

578:   /* For locally accessible PEs, use the host API, which uses the CUDA copy engines and is much faster than the device API */
579:   if (nLocallyAccessible) {
580:     WaitSignalsFromLocallyAccessible<<<1,1,0,link->remoteCommStream>>>(ndstranks,dstranks_d,srcsig,dst);
581:     for (int i=0; i<ndstranks; i++) {
582:       int pe = dstranks_h[i];
583:       if (nvshmem_ptr(dst,pe)) { /* A non-null return means <pe> is locally accessible */
584:         size_t nelems = (srcdisp_h[i+1]-srcdisp_h[i])*link->unitbytes;
585:          /* Initiate the nonblocking communication */
586:         nvshmemx_putmem_nbi_on_stream(dst+dstdisp_h[i]*link->unitbytes,src+(srcdisp_h[i]-srcdisp_h[0])*link->unitbytes,nelems,pe,link->remoteCommStream);
587:       }
588:     }
589:   }

591:   if (nLocallyAccessible) {
592:     nvshmemx_quiet_on_stream(link->remoteCommStream); /* Calling nvshmem_fence/quiet() does not fence the above nvshmemx_putmem_nbi_on_stream! */
593:   }
594:   return(0);
595: }

597: /* A one-thread kernel; the single thread handles all remote PEs */
598: __global__ static void PutDataEnd(PetscInt nsrcranks,PetscInt ndstranks,PetscMPIInt *dstranks,uint64_t *dstsig,PetscInt *dstsigdisp)
599: {
600:   /* TODO: Should we finish the non-blocking remote puts here? */

602:   /* 1. Send a signal to each dst rank */

604:   /* According to Akhil@NVIDIA, IB is ordered, so no fence is needed for remote PEs.
605:      For local PEs, we already called nvshmemx_quiet_on_stream(). Therefore, we are good to send signals to all dst ranks now.
606:   */
607:   for (int i=0; i<ndstranks; i++) {nvshmemx_uint64_signal(dstsig+dstsigdisp[i],1,dstranks[i]);} /* set sig to 1 */

609:   /* 2. Wait for signals from src ranks (if any) */
610:   if (nsrcranks) {
611:     nvshmem_uint64_wait_until_all(dstsig,nsrcranks,NULL/*no mask*/,NVSHMEM_CMP_EQ,1); /* wait sigs to be 1, then set them to 0 */
612:     for (int i=0; i<nsrcranks; i++) dstsig[i] = 0;
613:   }
614: }

616: /* Finish the communication -- A receiver waits until it can access its receive buffer */
617: PetscErrorCode PetscSFLinkPutDataEnd_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
618: {
619:   PetscErrorCode    ierr;
620:   cudaError_t       cerr;
621:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
622:   PetscMPIInt       *dstranks;
623:   uint64_t          *dstsig;
624:   PetscInt          nsrcranks,ndstranks,*dstsigdisp;

627:   if (direction == PETSCSF_ROOT2LEAF) { /* put root data to leaf */
628:     nsrcranks    = sf->nRemoteRootRanks;

630:     ndstranks    = bas->nRemoteLeafRanks;
631:     dstranks     = bas->iranks_d;       /* leaf ranks */
632:     dstsig       = link->leafRecvSig;   /* I will set my leaf ranks's RecvSig */
633:     dstsigdisp   = bas->leafsigdisp_d;  /* for my i-th remote leaf rank, I will access its signal at offset leafsigdisp[i] */
634:   } else { /* LEAF2ROOT */
635:     nsrcranks    = bas->nRemoteLeafRanks;

637:     ndstranks    = sf->nRemoteRootRanks;
638:     dstranks     = sf->ranks_d;
639:     dstsig       = link->rootRecvSig;
640:     dstsigdisp   = sf->rootsigdisp_d;
641:   }

643:   if (nsrcranks || ndstranks) {
644:     PutDataEnd<<<1,1,0,link->remoteCommStream>>>(nsrcranks,ndstranks,dstranks,dstsig,dstsigdisp);
645:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
646:   }
647:   PetscSFLinkBuildDependenceEnd(sf,link,direction);
648:   return(0);
649: }

651: /* PostUnpack operation -- A receiver tells its senders that they are allowed to put data here (i.e., the recv buf is free to take new data) */
652: PetscErrorCode PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
653: {
654:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
655:   uint64_t          *srcsig;
656:   PetscInt          nsrcranks,*srcsigdisp_d;
657:   PetscMPIInt       *srcranks_d;

660:   if (direction == PETSCSF_ROOT2LEAF) { /* I allow my root ranks to put data to me */
661:     nsrcranks    = sf->nRemoteRootRanks;
662:     srcsig       = link->rootSendSig;      /* I want to set their send signals */
663:     srcsigdisp_d = sf->rootsigdisp_d;      /* offset of each root signal */
664:     srcranks_d   = sf->ranks_d;            /* ranks of the n root ranks */
665:   } else { /* LEAF2ROOT */
666:     nsrcranks    = bas->nRemoteLeafRanks;
667:     srcsig       = link->leafSendSig;
668:     srcsigdisp_d = bas->leafsigdisp_d;
669:     srcranks_d   = bas->iranks_d;
670:   }

672:   if (nsrcranks) {
673:     NvshmemSendSignals<<<(nsrcranks+255)/256,256,0,link->remoteCommStream>>>(nsrcranks,srcsig,srcsigdisp_d,srcranks_d,0); /* Set remote signals to 0 */
674:     cudaError_t cerr = cudaGetLastError();CHKERRCUDA(cerr);
675:   }
676:   return(0);
677: }

679: /* Destructor when the link uses nvshmem for communication */
680: static PetscErrorCode PetscSFLinkDestroy_NVSHMEM(PetscSF sf,PetscSFLink link)
681: {
682:   PetscErrorCode    ierr;
683:   cudaError_t       cerr;

686:   cerr = cudaEventDestroy(link->dataReady);CHKERRCUDA(cerr);
687:   cerr = cudaEventDestroy(link->endRemoteComm);CHKERRCUDA(cerr);
688:   cerr = cudaStreamDestroy(link->remoteCommStream);CHKERRCUDA(cerr);

690:   /* NVSHMEM does not need buffers on the host, so they should be NULL */
691:   PetscNvshmemFree(link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
692:   PetscNvshmemFree(link->leafSendSig);
693:   PetscNvshmemFree(link->leafRecvSig);
694:   PetscNvshmemFree(link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
695:   PetscNvshmemFree(link->rootSendSig);
696:   PetscNvshmemFree(link->rootRecvSig);
697:   return(0);
698: }

700: PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
701: {
702:   PetscErrorCode    ierr;
703:   cudaError_t       cerr;
704:   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
705:   PetscSFLink       *p,link;
706:   PetscBool         match,rootdirect[2],leafdirect[2];
707:   int               greatestPriority;

710:   /* Check to see if we can directly send/recv root/leafdata with the given sf, sfop and op.
711:      We only care about root/leafdirect[PETSCSF_REMOTE], since we never need intermediate buffers for local communication with NVSHMEM.
712:   */
713:   if (sfop == PETSCSF_BCAST) { /* Move data from rootbuf to leafbuf */
714:     if (sf->use_nvshmem_get) {
715:       rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* send buffer has to be stand-alone (can't be rootdata) */
716:       leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;
717:     } else {
718:       rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE;
719:       leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;  /* Our put-protocol always needs an NVSHMEM-allocated recv buffer */
720:     }
721:   } else if (sfop == PETSCSF_REDUCE) { /* Move data from leafbuf to rootbuf */
722:     if (sf->use_nvshmem_get) {
723:       rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;
724:       leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
725:     } else {
726:       rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
727:       leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE;
728:     }
729:   } else { /* PETSCSF_FETCH */
730:     rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
731:     leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
732:   }

734:   /* Look for free nvshmem links in cache */
735:   for (p=&bas->avail; (link=*p); p=&link->next) {
736:     if (link->use_nvshmem) {
737:       MPIPetsc_Type_compare(unit,link->unit,&match);
738:       if (match) {
739:         *p = link->next; /* Remove from available list */
740:         goto found;
741:       }
742:     }
743:   }
744:   PetscNew(&link);
745:   PetscSFLinkSetUp_Host(sf,link,unit); /* Compute link->unitbytes, dup link->unit etc. */
746:   if (sf->backend == PETSCSF_BACKEND_CUDA) {PetscSFLinkSetUp_CUDA(sf,link,unit);} /* Setup pack routines, streams etc */
747:  #if defined(PETSC_HAVE_KOKKOS)
748:   else if (sf->backend == PETSCSF_BACKEND_KOKKOS) {PetscSFLinkSetUp_Kokkos(sf,link,unit);}
749:  #endif

751:   link->rootdirect[PETSCSF_LOCAL]  = PETSC_TRUE; /* For the local part we directly use root/leafdata */
752:   link->leafdirect[PETSCSF_LOCAL]  = PETSC_TRUE;

754:   /* Init signals to zero */
755:   if (!link->rootSendSig) {PetscNvshmemCalloc(bas->nRemoteLeafRanksMax*sizeof(uint64_t),(void**)&link->rootSendSig);}
756:   if (!link->rootRecvSig) {PetscNvshmemCalloc(bas->nRemoteLeafRanksMax*sizeof(uint64_t),(void**)&link->rootRecvSig);}
757:   if (!link->leafSendSig) {PetscNvshmemCalloc(sf->nRemoteRootRanksMax*sizeof(uint64_t),(void**)&link->leafSendSig);}
758:   if (!link->leafRecvSig) {PetscNvshmemCalloc(sf->nRemoteRootRanksMax*sizeof(uint64_t),(void**)&link->leafRecvSig);}

760:   link->use_nvshmem                = PETSC_TRUE;
761:   link->rootmtype                  = PETSC_MEMTYPE_DEVICE; /* Only need 0/1-based mtype from now on */
762:   link->leafmtype                  = PETSC_MEMTYPE_DEVICE;
763:   /* Overwrite some function pointers set by PetscSFLinkSetUp_CUDA */
764:   link->Destroy                    = PetscSFLinkDestroy_NVSHMEM;
765:   if (sf->use_nvshmem_get) { /* get-based protocol */
766:     link->PrePack                  = PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM;
767:     link->StartCommunication       = PetscSFLinkGetDataBegin_NVSHMEM;
768:     link->FinishCommunication      = PetscSFLinkGetDataEnd_NVSHMEM;
769:   } else { /* put-based protocol */
770:     link->StartCommunication       = PetscSFLinkPutDataBegin_NVSHMEM;
771:     link->FinishCommunication      = PetscSFLinkPutDataEnd_NVSHMEM;
772:     link->PostUnpack               = PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM;
773:   }

775:   cerr = cudaDeviceGetStreamPriorityRange(NULL,&greatestPriority);CHKERRCUDA(cerr);
776:   cerr = cudaStreamCreateWithPriority(&link->remoteCommStream,cudaStreamNonBlocking,greatestPriority);CHKERRCUDA(cerr);

778:   cerr = cudaEventCreateWithFlags(&link->dataReady,cudaEventDisableTiming);CHKERRCUDA(cerr);
779:   cerr = cudaEventCreateWithFlags(&link->endRemoteComm,cudaEventDisableTiming);CHKERRCUDA(cerr);

781: found:
782:   if (rootdirect[PETSCSF_REMOTE]) {
783:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char*)rootdata + bas->rootstart[PETSCSF_REMOTE]*link->unitbytes;
784:   } else {
785:     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) {
786:       PetscNvshmemMalloc(bas->rootbuflen_rmax*link->unitbytes,(void**)&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
787:     }
788:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
789:   }

791:   if (leafdirect[PETSCSF_REMOTE]) {
792:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char*)leafdata + sf->leafstart[PETSCSF_REMOTE]*link->unitbytes;
793:   } else {
794:     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) {
795:       PetscNvshmemMalloc(sf->leafbuflen_rmax*link->unitbytes,(void**)&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
796:     }
797:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
798:   }

800:   link->rootdirect[PETSCSF_REMOTE] = rootdirect[PETSCSF_REMOTE];
801:   link->leafdirect[PETSCSF_REMOTE] = leafdirect[PETSCSF_REMOTE];
802:   link->rootdata                   = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
803:   link->leafdata                   = leafdata;
804:   link->next                       = bas->inuse;
805:   bas->inuse                       = link;
806:   *mylink                          = link;
807:   return(0);
808: }

810: #if defined(PETSC_USE_REAL_SINGLE)
811: PetscErrorCode PetscNvshmemSum(PetscInt count,float *dst,const float *src)
812: {
813:   PetscErrorCode    ierr;
814:   PetscMPIInt       num; /* Assume nvshmem's int is MPI's int */

817:   PetscMPIIntCast(count,&num);
818:   nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
819:   return(0);
820: }

822: PetscErrorCode PetscNvshmemMax(PetscInt count,float *dst,const float *src)
823: {
824:   PetscErrorCode    ierr;
825:   PetscMPIInt       num;

828:   PetscMPIIntCast(count,&num);
829:   nvshmemx_float_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
830:   return(0);
831: }
832: #elif defined(PETSC_USE_REAL_DOUBLE)
833: PetscErrorCode PetscNvshmemSum(PetscInt count,double *dst,const double *src)
834: {
835:   PetscErrorCode    ierr;
836:   PetscMPIInt       num;

839:   PetscMPIIntCast(count,&num);
840:   nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
841:   return(0);
842: }

844: PetscErrorCode PetscNvshmemMax(PetscInt count,double *dst,const double *src)
845: {
846:   PetscErrorCode    ierr;
847:   PetscMPIInt       num;

850:   PetscMPIIntCast(count,&num);
851:   nvshmemx_double_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
852:   return(0);
853: }
854: #endif
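A hedged usage sketch of these reduction wrappers (not from this file; assumes a double-precision build and that both arrays were obtained with PetscNvshmemMalloc() with the same count n on every PE, since NVSHMEM collectives require symmetric buffers):

  double *src,*dst;            /* both in the NVSHMEM symmetric heap */
  PetscNvshmemSum(n,dst,src);  /* dst[i] = sum over all PEs of src[i], enqueued on PetscDefaultCudaStream */
  PetscNvshmemMax(n,dst,src);  /* dst[i] = max over all PEs of src[i] */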