Actual source code: cupminit.inc

petsc-3.15.0 2021-03-30
Report Typos and Errors
  1: /* A template file for the CUDA Programming Model (CUPM) initialization, to be included in init.c. CUPM is either CUDA or HIP. */

  3: PetscBool PetscCUPMSynchronize     = PETSC_FALSE; /* Reset to the logView flag by PetscCUPMInitializeAndView(); also bound to the synchronize option in PetscOptionsCheckCUPM() */
  4: PetscBool PetscCUPMInitialized     = PETSC_FALSE; /* Set to PETSC_TRUE in this file after the BLAS and dense-solver handle initializers have been called */

  6: cupmStream_t  PetscDefaultCupmStream = NULL; /* Left NULL by this file unless -petsc_default_use_null_stream is false, in which case a stream is created during initialization */

  8: static PetscBool PetscNotUseCUPM   = PETSC_FALSE; /* Set by PetscCUPMInitialize() when device is -3 (NONE): asserts the code will not use this type of device */

 10: /*
      PetscCUPMValidate - Device validation after it is lazily initialized.

      If use_gpu_aware_mpi is set, determine whether the MPI library is GPU-aware. When it is not,
      print guidance through PetscErrorPrintf and abort with PETSCABORT(). When use_gpu_aware_mpi
      is not set, nothing is checked. Returns 0 whenever the function returns.
    */
 11: static PetscErrorCode PetscCUPMValidate(void)
 12: {
 13:   PetscBool  mpi_gpu_awareness;

 16:   if (use_gpu_aware_mpi) {
 17: #if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) && defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 18:     /* Trust OpenMPI's compile time gpu query interface */
 19:     mpi_gpu_awareness = PETSC_TRUE;
 20: #else
 21:     /* For other MPI implementations without gpu query API, we do a GPU MPI call to see if it segfaults.
 22:       Note that Spectrum MPI sets OMPI_MAJOR_VERSION and is CUDA-aware, but does not have MPIX_CUDA_AWARE_SUPPORT.
 23:     */
 24:     mpi_gpu_awareness = PetscMPICUPMAwarenessCheck();
 25: #endif
         /* A non-GPU-aware MPI is treated as fatal here: explain the possible remedies, then abort */
 26:     if (!mpi_gpu_awareness) {
 27:       (*PetscErrorPrintf)("PETSc is configured with GPU support, but your MPI is not GPU-aware. For better performance, please use a GPU-aware MPI.\n");
 28:       (*PetscErrorPrintf)("If you do not care, add option -use_gpu_aware_mpi 0. To not see the message again, add the option to your .petscrc, OR add it to the env var PETSC_OPTIONS.\n");
 29:       (*PetscErrorPrintf)("If you do care, for IBM Spectrum MPI on OLCF Summit, you may need jsrun --smpiargs=-gpu.\n");
 30:       (*PetscErrorPrintf)("For OpenMPI, you need to configure it --with-cuda (https://www.open-mpi.org/faq/?category=buildcuda)\n");
 31:       (*PetscErrorPrintf)("For MVAPICH2-GDR, you need to set MV2_USE_CUDA=1 (http://mvapich.cse.ohio-state.edu/userguide/gdr/)\n");
 32:       (*PetscErrorPrintf)("For Cray-MPICH, you need to set MPICH_RDMA_ENABLED_CUDA=1 (https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/)\n");
 33:       PETSCABORT(PETSC_COMM_SELF,PETSC_ERR_LIB);
 34:     }
 35:   }
 36:   return(0);
 37: }

 39: /*@C
 40:      PetscCUDAInitializeCheck - Check if CUDA is initialized. If not, initialize it.

 42:   Logically collective

 44:   Level: beginner

 46:   Notes:
 47:     With PETSc's lazy device initialization, PETSc calls this function right before creating the first CUDA/HIP object.
 48:     It can also be used by application developers who want to lazily initialize CUDA/HIP when they start to use it (which may be before a PETSc CUDA/HIP object is created).

 50:   .seealso: PetscCUDAInitialize(), PetscHIPInitialize(), PetscHIPInitializeCheck()
 51: @*/
 52: PETSC_EXTERN PetscErrorCode PetscCUDAInitializeCheck(void);


 55: /*@C
 56:      PetscHIPInitializeCheck - Check if HIP is initialized. If not, initialize it.

 58:   Logically collective

 60:   Level: beginner

 62:   Notes:
 63:     This is the HIP counterpart of PetscCUDAInitializeCheck(); see the notes of that routine for details.

 65:   .seealso: PetscHIPInitialize(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
 66: @*/
 67: PETSC_EXTERN PetscErrorCode PetscHIPInitializeCheck(void);

 /*
   PetscCUPMInitializeCheck - Initialize the device on first use, then validate it once.

   On the first call (PetscCUPMInitialized is false) and when devices are present, this selects a
   device for the calling process (rank in PETSC_COMM_WORLD modulo the device count) if setting the
   device flags succeeds. It then optionally creates PetscDefaultCupmStream, calls the BLAS and
   dense-solver handle initializers and sets PetscCUPMInitialized. PetscCUPMValidate() is run once
   per process, and PetscCreatedGpuObjects is set on every call.

   Errors if PetscNotUseCUPM is set (the user asserted no device would be used), or if the device
   cannot be selected after three attempts.

   NOTE(review): ierr is unused, and the return codes of MPI_Comm_rank(), PetscOptionsGetBool(), the
   handle initializers and PetscCUPMValidate() are not checked in this listing - confirm against the
   original source.
 */
 69: PetscErrorCode PetscCUPMInitializeCheck(void)
 70: {
 71:   PetscErrorCode        ierr;
 72:   cupmError_t           cerr;
 73:   int                   devId,devCount;
 74:   PetscMPIInt           rank;
 75:   static PetscBool      cupmValidateChecked = PETSC_FALSE; /* PetscCUPMValidate() is only run once per process */
 76:   PetscBool             useNull = PETSC_TRUE; /* use the default (NULL) stream as petsc's default stream */

       /* Report the real option name (cupmSetDeviceStr) and the public manpages, not a non-existent -device_set option */
 79:   if (PetscNotUseCUPM) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"You asserted the code wouldn't use devices with %s none, but now trying to create a device object. Remove this option or see manpage of PetscCUDAInitialize()/PetscHIPInitialize().",cupmSetDeviceStr);
 80:   if (!PetscCUPMInitialized) {
 81:     cerr = cupmGetDeviceCount(&devCount);
 82:     cupmGetLastError(); /* Reset the last error */
 83:     if (cerr != cupmSuccess) devCount = 0; /* A failed query is treated as "no devices" rather than as an error */
 84:     if (devCount > 0) {
 85:       cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
 86:       cupmGetLastError(); /* Reset the last error */
 87:       if (cerr == cupmSuccess) { /* It implies device runtime has not been initialized? */
 88:         MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
 89:         devId = rank % devCount; /* Round-robin assignment of ranks to devices */
             /* Try up to three times; only memory-allocation and out-of-resources failures are retried, anything else errors immediately */
 90:         for (int i=0; i<3; i++) {
 91:           cerr = cupmSetDevice(devId);
 92:           if (cerr == cupmSuccess) break;
 93:           if (cerr != cupmErrorMemoryAllocation && cerr != cupmErrorLaunchOutOfResources) CHKERRCUPM(cerr);
 94:           if (i < 2) {PetscSleep(3);} /* Pause before the next attempt; no pause after the last one */
 95:         }
 96:         if (cerr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU_RESOURCE,"Unable to initialize the GPU");
 97:       } else if (cerr == cupmErrorSetOnActiveProcess) {
 98:         /* It implies user has initialized device runtime outside of petsc. We do nothing to respect the device choice. */
 99:       }
           /* NOTE(review): any other cupmSetDeviceFlags() failure is silently ignored here */
100:     }
101:     PetscOptionsGetBool(NULL,NULL,"-petsc_default_use_null_stream",&useNull,NULL);
102:     if (!useNull) {cerr = cupmStreamCreate(&PetscDefaultCupmStream);CHKERRCUPM(cerr);}
103:     PetscCUPMBLASInitializeHandle();
104:     PetscCUPMSOLVERDnInitializeHandle();
105:     PetscCUPMInitialized = PETSC_TRUE;
106:   }

108:   if (!cupmValidateChecked) {
109:     PetscCUPMValidate();
110:     cupmValidateChecked = PETSC_TRUE;
111:   }
112:   PetscCreatedGpuObjects = PETSC_TRUE;
113:   return(0);
114: }

116: /*@C
117:      PetscCUDAInitialize - Initializes CUDA (eagerly in PetscInitialize() or soon after PetscInitialize()) and cuBLAS/cuSPARSE libraries on the device

119:      Logically collective

121:   Input Parameters:
122: + comm   - the MPI communicator that will utilize the devices
123: - device - the device assigned to current MPI process. Special values like PETSC_DECIDE or PETSC_DEFAULT have special meanings (see details below)

125:   Options Database:
126: +  -cuda_device <device> - the device assigned to current MPI rank. <device> is case-insensitive and can be:
127:        NONE (or none, or -3) : asserts that the code will not use any device; it will error out if a device object is created anyway;
128:        PETSC_DEFAULT(or DEFAULT, or -2) : do not explicitly set device, i.e., use whatever device was already set by the user (probably before PetscInitialize()). Init device runtime etc;
129:        PETSC_DECIDE (or DECIDE, or -1) : assign MPI ranks in comm to available devices in round-robin, and init device runtime etc on the selected device;
130:        >= 0 integer  : assign the device with this id to current MPI process. Error out if <device> is invalid. Init device runtime etc on this device;
131:      With PETSC_{DECIDE, DEFAULT}, if there are actually no devices, the code can still run, but it will error out when trying to create device objects.
132: .  -cuda_view              - view information about the devices.
133: .  -cuda_synchronize       - wait at the end of asynchronous device calls so that their time gets credited to the current event. With -log_view, the default is true, otherwise false.
134: .  -log_view               - logging, however if alone or combined with `-cuda_device DEFAULT | DECIDE | >=0 int`, will init device; if combined with `-cuda_device none`, won't init device.
135: .  -petsc_default_use_null_stream   - If true (default), petsc will use the default NULL stream to launch its kernels and call vendor libraries such as cuBLAS, cuSPARSE etc.
136: -  -use_gpu_aware_mpi      - assume the MPI is device/GPU-aware when communicating data on devices. Default true.

138:   Level: beginner

140:   Notes:
141:     Unless the input parameter <device> = -3, this routine initializes the CUDA device. It also initializes the cuBLAS/cuSPARSE libraries, which
142:     takes a lot of time. Initializing them early helps avoid skewing timings in -log_view.

144:     If this routine is triggered by command line options, it is called in PetscInitialize(). If users want to call it directly, they should call it immediately after PetscInitialize().

146:     If this is not called, then the CUDA initialization is delayed until the first creation of a CUDA object, and this can affect the timing since the initializations happen asynchronously on different nodes and take a lot of time.

148:    .seealso: PetscCUDAInitializeCheck(), PetscHIPInitialize(), PetscHIPInitializeCheck()
149: @*/
150: PETSC_EXTERN PetscErrorCode PetscCUDAInitialize(MPI_Comm comm,PetscInt device);
151: /*@C
152:      PetscHIPInitialize - Initializes HIP (eagerly in PetscInitialize() or soon after PetscInitialize()) and hipBLAS/hipSPARSE libraries on the device

154:      Logically collective

156:   Input Parameters:
157:    (see notes)

159:   Options Database:
160:    (see notes)

162:   Level: beginner

164:   Notes:
165:     The functionality, parameters and options database of this routine are similar to those of PetscCUDAInitialize(), except that the option names
166:     are -hip_device, -hip_view, -hip_synchronize instead. See the manpage of PetscCUDAInitialize() for details.

168:   .seealso: PetscHIPInitializeCheck(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
169: @*/
170: PETSC_EXTERN PetscErrorCode PetscHIPInitialize(MPI_Comm comm,PetscInt device);

/*
  PetscCUPMInitialize - Eagerly select a device and initialize the device handles.

  Input Parameters:
    comm:   communicator whose ranks are spread over the devices when device is PETSC_DECIDE
    device: >= 0 selects that device id; PETSC_DECIDE assigns devices round-robin by rank in comm;
            PETSC_DEFAULT keeps whatever device is already set; -3 (NONE) sets PetscNotUseCUPM and
            initializes nothing. Any other value is an error.

  Does nothing if PetscCUPMInitialized is already true. The stream and the BLAS/dense-solver handles
  are only set up when at least one device was found and device is not NONE.

  NOTE(review): ierr is unused, and the return codes of MPI_Comm_rank(), PetscOptionsGetBool() and the
  handle initializers are not checked in this listing - confirm against the original source.
*/
172: PetscErrorCode PetscCUPMInitialize(MPI_Comm comm,PetscInt device)
173: {
174:   PetscErrorCode        ierr;
175:   cupmError_t           cerr;
176:   int                   devId,devCount=0;
177:   const PetscInt        PETSC_NONE=-3; /* Unlike PETSC_DECIDE, we don't have a macro PETSC_NONE in petsc headers */
178:   PetscMPIInt           rank;

181:   if (!PetscCUPMInitialized) {
182:     cerr = cupmGetDeviceCount(&devCount);
183:     cupmGetLastError(); /* Reset the last error */
184:     if (cerr != cupmSuccess) devCount = 0; /* A failed query is treated as "no devices" rather than as an error */
185:     if (device >= 0) { /* User wants to use this specific device */
186:       cerr = cupmSetDeviceFlags(cupmDeviceMapHost); /* Allow it to fail since user might have already initialized the device. */
187:       cupmGetLastError(); /* Reset the last error */
188:       cerr = cupmSetDevice((int)device);CHKERRCUPM(cerr);
189:     } else if (device == PETSC_DECIDE) { /* Assign MPI ranks to available devices in round-robin */
190:       if (devCount > 0) { /* Allow no device as long as user does not use devices */
191:         /* Set the device flags so that it can map host memory */
192:         cerr  = cupmSetDeviceFlags(cupmDeviceMapHost);CHKERRCUPM(cerr);
193:         MPI_Comm_rank(comm,&rank);
194:         devId = rank % devCount;
195:         cerr  = cupmSetDevice(devId);CHKERRCUPM(cerr);
196:       }
197:     } else if (device == PETSC_DEFAULT) {
198:       /* Do nothing, i.e., use whatever device set by user before PetscInitialize() */
199:     } else if (device == PETSC_NONE) {
200:       PetscNotUseCUPM = PETSC_TRUE; /* Assert the code won't use devices even if there are some */
         /* Report the real option name (cupmSetDeviceStr) instead of a non-existent -device_set option */
201:     } else SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Wrong device (%D) passed to %s <dev>. Must be NONE(-3),PETSC_DEFAULT(-2),PETSC_DECIDE(-1) or a non-negative integer.",device,cupmSetDeviceStr);

203:     if (devCount > 0 && device != PETSC_NONE) {
204:       /* Do the costly device handle initialization here so that it does not distort petsc logging later */
205:       PetscBool useNull = PETSC_TRUE; /* use the default (NULL) stream as petsc's default stream */
206:       PetscOptionsGetBool(NULL,NULL,"-petsc_default_use_null_stream",&useNull,NULL);
207:       if (!useNull) {cerr = cupmStreamCreate(&PetscDefaultCupmStream);CHKERRCUPM(cerr);}
208:       PetscCUPMBLASInitializeHandle();
209:       PetscCUPMSOLVERDnInitializeHandle();
210:       PetscCUPMInitialized = PETSC_TRUE;
211:     }
212:   }
213:   return(0);
214: }

216: /*
217:   The routine works as a driver to initialize and view the device

219:   Input Parameters:
220:     initDevice: True if user explicitly has -cuda/hip_device xxx
221:     device:     Significant when <initDevice>. Basically, it is the integer representation of the xxx above
222:     logView:    True if -log_view or -log_summary
223:     devView:    True if -{cuda,hip}_view

       Notes:
         PetscCUPMSynchronize is unconditionally reset to logView on entry, so any value stored in it
         before this call is lost.

         NOTE(review): ierr is unused, and the return codes of PetscCUPMInitialize(), MPI_Comm_rank(),
         PetscKokkosInitializeCheck() and the print routines are not checked in this listing - confirm
         against the original source.
224:  */
225: static PetscErrorCode PetscCUPMInitializeAndView(PetscBool initDevice,PetscInt device,PetscBool logView,PetscBool devView)
226: {
227:   PetscErrorCode        ierr;
228:   cupmError_t           cerr;
229:   PetscMPIInt           rank;
230:   int                   devId,devCount;
231:   cupmDeviceProp        prop;

234:   PetscCUPMSynchronize = logView;
235:   if (initDevice) {PetscCUPMInitialize(PETSC_COMM_WORLD,device);}
236:   else if (logView) { /* With -log_view, we want to do the costly gpu runtime initialization early so that it does not distort the timing later. */
237:     devCount = 0;
238:     cerr = cupmGetDeviceCount(&devCount);
239:     cupmGetLastError(); /* Reset the last error */
240:     if (cerr == cupmSuccess && devCount >= 1) { /* There are devices */
241:       devId = 0; /* With a single device there is nothing to decide */
242:       if (devCount > 1) { /* Decide which device to init when there are multiple */
243:         cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
244:         cupmGetLastError(); /* Reset the last error */
245:         if (cerr == cupmSuccess) { /* It implies gpu runtime has not been initialized */
246:           MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
247:           devId = rank % devCount;
248:           cerr  = cupmSetDevice(devId);CHKERRCUPM(cerr);
249:         } else if (cerr == cupmErrorSetOnActiveProcess) {
250:           /* It means user initialized gpu runtime outside of petsc. We respect the device choice. */
251:           cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
252:         }
             /* NOTE(review): on any other cupmSetDeviceFlags() failure devId stays 0 */
253:       }
           /* Pass the chosen id as an explicit (>= 0) device so PetscCUPMInitialize() sets it and initializes the handles */
254:       PetscCUPMInitialize(PETSC_COMM_WORLD,(PetscInt)devId);
255:      #if defined(PETSC_HAVE_KOKKOS)
256:       /* With -log_view, we always do eager init */
257:       PetscKokkosInitializeCheck();
258:      #endif
259:     }
260:   }

       /* Print every device via PetscPrintf on PETSC_COMM_WORLD, then each rank's current device via synchronized printing */
262:   if (devView) {
263:     MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
         /* NOTE(review): unlike the queries above, a failing device-count query is raised as an error here */
264:     cerr = cupmGetDeviceCount(&devCount);CHKERRCUPM(cerr);
265:     for (devId = 0; devId < devCount; ++devId) {
266:       cerr = cupmGetDeviceProperties(&prop,devId);CHKERRCUPM(cerr);
267:       PetscPrintf(PETSC_COMM_WORLD, "device %d: %s\n", devId, prop.name);
268:     }
269:     cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
270:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Using device %d.\n",rank,devId);
271:     PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT);
272:   }
273:   return(0);
274: }

276: /*
277:   The routine checks user's device related options and initializes the device if instructed.

279:   Input Parameter:
280:     logView:    True if -log_view or -log_summary

       Notes:
         The synchronize option defaults to logView. Its value is applied to PetscCUPMSynchronize only
         after PetscCUPMInitializeAndView() returns, because that routine resets the flag to logView and
         would otherwise discard an explicit user setting.

         When the device option is "none" (case-insensitive), device is set to -3; otherwise the same
         option is parsed as an integer.

         NOTE(review): the return codes of the PetscOptions*() calls, PetscStrcasecmp() and
         PetscCUPMInitializeAndView() are not checked in this listing - confirm against the original source.
281:  */
282: static PetscErrorCode PetscOptionsCheckCUPM(PetscBool logView)
283: {
285:   PetscBool      initDevice = PETSC_FALSE,devView = PETSC_FALSE,devNone = PETSC_FALSE;
       PetscBool      devSync = logView; /* requested synchronize setting; on by default with -log_view */
286:   PetscInt       device = 0;
287:   char           devStr[32]={0};
288: #if defined(PETSC_HAVE_KOKKOS)
289:   PetscBool      set,kinited,devDefault;
290: #endif

293: #if defined(PETSC_HAVE_KOKKOS)
294:   PetscKokkosIsInitialized_Private(&kinited);
295:   if (kinited) { /* Check if Petsc device options conform with Kokkos' device if Kokkos is init'ed before PetscInitialize() */
296:     PetscOptionsGetString(NULL,NULL,cupmSetDeviceStr,devStr,sizeof(devStr),&set);
297:     if (set) { /* If users have initialized Kokkos themselves, but also had e.g., -cuda_device XXX, for simplicity, make sure XXX is DEFAULT */
298:       PetscStrcasecmp("DEFAULT",devStr,&devDefault);
299:       if (!devDefault) {PetscStrcasecmp("PETSC_DEFAULT",devStr,&devDefault);}
300:       if (!devDefault) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Kokkos was initialized before PetscInitialize(), but you have %s %s. Remove the option or use %s default.",cupmSetDeviceStr,devStr,cupmSetDeviceStr);
301:     } else { /* If users did not have e.g., '-cuda_device XXX', insert one here so that petsc can continue its own device initialization */
302:       PetscOptionsSetValue(NULL,cupmSetDeviceStr,"DEFAULT");
303:     }
304:   }
305: #endif

307:   PetscOptionsBegin(PETSC_COMM_WORLD,NULL,cupmOptionsStr,"Sys");
308:   PetscOptionsString(cupmSetDeviceStr,NULL,PetscCUPMInitializeStr,devStr,devStr,sizeof(devStr),&initDevice);
309:   PetscStrcasecmp("none",devStr,&devNone);
310:   if (devNone) device = -3; /* -3 is the locally used PETSC_NONE in Petsc{CUDA/HIP}Initialize() */
311:   else {PetscOptionsInt(cupmSetDeviceStr,"Set which MPI ranks to use which devices",PetscCUPMInitializeStr,device,&device,&initDevice);}
       /* Parse into a local: PetscCUPMInitializeAndView() below overwrites PetscCUPMSynchronize with logView */
312:   PetscOptionsBool(cupmSynchronizeStr,"Wait for the device to complete operations before returning to the CPU (on by default with -log_summary or -log_view)",NULL,devSync,&devSync,NULL);
313:   PetscOptionsName(cupmViewStr,"Display device information and assignments",NULL,&devView);
314:   PetscOptionsEnd();
315:   PetscCUPMInitializeAndView(initDevice,device,logView,devView);
       PetscCUPMSynchronize = devSync; /* Apply the user's choice last so that it is not lost */
316:   return(0);
317: }