/* Mainly for MPI_Isend in SFBASIC. Once SFNEIGHBOR, SFALLGATHERV etc. have a persistent version,
   we can also do abstractions like Prepare/StartCommunication.
*/
#include <../src/vec/is/sf/impls/basic/sfpack.h>

/* Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer */
static PetscErrorCode PetscSFLinkStartRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode ierr;
  PetscMPIInt    nreqs;
  MPI_Request    *reqs = NULL;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       buflen;

  PetscFunctionBegin;
  /* Post receives for the buffer that will receive the incoming messages */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = sf->nleafreqs;
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr);
    } else { /* leaf to root */
      nreqs = bas->nrootreqs;
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr);
    }
    ierr = MPI_Startall_irecv(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }

  /* Start sends from the buffer holding the outgoing data */
  buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
  if (buflen) {
    if (direction == PETSCSF_ROOT2LEAF) {
      nreqs = bas->nrootreqs;
      ierr  = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/* device2host before sending */);CHKERRQ(ierr);
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,&reqs,NULL);CHKERRQ(ierr);
    } else { /* leaf to root */
      nreqs = sf->nleafreqs;
      ierr  = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
      ierr  = PetscSFLinkGetMPIBuffersAndRequests(sf,link,direction,NULL,NULL,NULL,&reqs);CHKERRQ(ierr);
    }
    ierr = PetscSFLinkSyncStreamBeforeCallMPI(sf,link,direction);CHKERRQ(ierr);
    ierr = MPI_Startall_isend(buflen,link->unit,nreqs,reqs);CHKERRMPI(ierr);
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode PetscSFLinkWaitRequests_MPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  if (direction == PETSCSF_ROOT2LEAF) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE/* host2device after receiving */);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/*
   This routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable link, it reuses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, based on the fact that they both deal with sparse graphs and
   need to pack/unpack data. (A usage sketch appears at the end of this file.)
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as an array subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = (rootdirect[PETSCSF_REMOTE] && rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = (leafdirect[PETSCSF_REMOTE] && leafmtype_mpi == leafmtype) ? 1 : 0;
  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;
  /* Look for free links in the cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }
  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which ones need to be freed in Destroy */
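  /* link->reqs is partitioned into 16 blocks: the first 8 blocks of length nrootreqs hold root requests,
     indexed by [direction][memtype][rootdirect]; the following 8 blocks of length nleafreqs hold leaf
     requests, indexed the same way. The loop below records the start of each block. */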
  for (i=0; i<2; i++) { /* Two communication directions */
    for (j=0; j<2; j++) { /* Two memory types */
      for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }
  link->StartCommunication  = PetscSFLinkStartRequests_MPI;
  link->FinishCommunication = PetscSFLinkWaitRequests_MPI;
found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_CUDA(sf,link,unit);CHKERRQ(ierr);}   /* Set up streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_HIP(sf,link,unit);CHKERRQ(ierr);}    /* Set up streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
  #endif
  }
#endif
  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we can disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }
#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for buffering device data in case we do not use_gpu_aware_mpi */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif
  /* Set the `current` state of the link. It may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are the keys used to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
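
/* The sketch below (disabled, illustration only) shows how the pieces above might be driven for a
   host-memory broadcast: create or reuse a link, start the persistent sends/recvs, then wait on them.
   It elides the pack/unpack of root/leafdata into the link buffers that the real SFBasic begin/end
   routines perform, so it is not a substitute for PetscSFBcastBegin/End; the function name is hypothetical. */
#if 0
static PetscErrorCode PetscSFLinkBcastSketch(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link;

  PetscFunctionBegin;
  /* Get a link whose buffers/requests match this (unit,rootdata,leafdata,op,sfop) combination */
  ierr = PetscSFLinkCreate_MPI(sf,unit,PETSC_MEMTYPE_HOST,rootdata,PETSC_MEMTYPE_HOST,leafdata,MPI_REPLACE,PETSCSF_BCAST,&link);CHKERRQ(ierr);
  /* ... pack rootdata into link->rootbuf[PETSCSF_REMOTE][...] here if rootdirect is false ... */
  ierr = (*link->StartCommunication)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);  /* posts recvs, starts sends */
  ierr = (*link->FinishCommunication)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr); /* waits on the requests */
  /* ... unpack link->leafbuf[PETSCSF_REMOTE][...] into leafdata here if leafdirect is false ... */
  PetscFunctionReturn(0);
}
#endif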