Jean Zay: The PETSc library

Introduction

PETSc is a library of functions written in C (with a Fortran interface as well) for handling dense and sparse vectors and matrices, and for solving the corresponding linear systems (often arising from partial differential equations) with direct or iterative solvers. Additionally, PETSc offers non-linear solvers, methods for solving differential equations and many other tools. Finally, PETSc can also call external libraries.

PETSc is available for CPU and GPU.
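As a quick illustration of the vector, matrix and linear-solver handling described above, here is a minimal sketch which is not taken from the Jean Zay exercises: the file name tridiag.c, the problem size and the right-hand side are arbitrary choices made for the illustration. It assembles a small 1-D Laplacian and solves it with the default Krylov solver.

tridiag.c
#include <petscksp.h>
 
int main(int argc, char **argv)
{
  const PetscInt n = 8;   /* arbitrary problem size, for illustration only */
  Mat            A;
  Vec            x, b;
  KSP            ksp;
  PetscInt       i;
 
  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
 
  /* Assemble the tridiagonal matrix of the 1-D Laplacian; for simplicity every
     process inserts all rows, PETSc communicates off-process values at assembly */
  PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
  PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, n, n));
  PetscCall(MatSetFromOptions(A));
  PetscCall(MatSetUp(A));
  for (i = 0; i < n; i++) {
    const PetscScalar v[3]    = {-1.0, 2.0, -1.0};
    const PetscInt    cols[3] = {i - 1, i, i + 1};
    if (i == 0)          PetscCall(MatSetValues(A, 1, &i, 2, &cols[1], &v[1], INSERT_VALUES));
    else if (i == n - 1) PetscCall(MatSetValues(A, 1, &i, 2, cols, v, INSERT_VALUES));
    else                 PetscCall(MatSetValues(A, 1, &i, 3, cols, v, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
 
  /* Right-hand side b = 1 and solution vector x */
  PetscCall(MatCreateVecs(A, &x, &b));
  PetscCall(VecSet(b, 1.0));
 
  /* Solve A x = b; the method is configurable at run time (-ksp_type, -pc_type) */
  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
  PetscCall(KSPSetOperators(ksp, A, A));
  PetscCall(KSPSetFromOptions(ksp));
  PetscCall(KSPSolve(ksp, b, x));
  PetscCall(VecView(x, PETSC_VIEWER_STDOUT_WORLD));
 
  PetscCall(KSPDestroy(&ksp));
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&b));
  PetscCall(PetscFinalize());
  return 0;
}

As with any PETSc program, the solver and preconditioner can be chosen at run time, for example with -ksp_type cg -ksp_monitor.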

Installed versions

Different versions are installed on Jean Zay and can be accessed by module. They can be listed with the command

$ module avail petsc

Additional information on a particular version can be obtained with the command

$ module show petsc-xxx

For example

$ module show petsc/3.21.4-mpi-complex-int64
-------------------------------------------------------------------
/lustre/fshomisc/sup/hpe/pub/modules-idris-env4/modulefiles/linux-rhel9-skylake_avx512/petsc/3.21.4-mpi-complex-int64:
 
module-whatis   {PETSc is a suite of data structures and routines for the scalable (parallel) solution of scientific applications modeled by partial differential equations. }
prereq          intel-compilers/19.1.3 intel-compilers/2021.9.0
prereq          intel-mpi/2019.9
conflict        petsc
 
[...]
setenv          PETSC_DIR /gpfslocalsup/spack_soft/petsc/3.21.4/intel-19.1.3-mdaguppg7rrc6hqhpvgaopqhv2guj2nj
[...]
 
Available software environment(s):
- intel-compilers/19.1.3 intel-mpi/2019.9
- intel-compilers/2021.9.0 intel-mpi/2021.9
 
If you want to use this module with another software environment,
please contact the support team.
-------------------------------------------------------------------

This installation is available in two different environments, the first one listed (here, intel-compilers/19.1.3 intel-mpi/2019.9) being the default. If you want to use one of the other environments, you must load it beforehand, for example:

$ module load intel-compilers/2021.9.0 intel-mpi/2021.9 petsc

The options used for PETSc compilation can be found in the file $PETSC_DIR/include/petscconfiginfo.h and the installed options in $PETSC_DIR/include/petscconf.h, where in our example $PETSC_DIR is to be replaced by /gpfslocalsup/spack_soft/petsc/3.21.4/intel-19.1.3-mdaguppg7rrc6hqhpvgaopqhv2guj2nj .
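If you also want to check at run time which PETSc release an executable is actually linked against (for instance after switching software environments), the version macros from petscversion.h and the PetscGetVersion() routine can be used, as in the following small sketch (the file name petsc_version.c is an arbitrary choice):

petsc_version.c
#include <petscsys.h>
 
int main(int argc, char **argv)
{
  char version[256];
 
  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
 
  /* Version string of the PETSc library the executable is linked against */
  PetscCall(PetscGetVersion(version, sizeof(version)));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%s\n", version));
 
  /* Compile-time version of the headers used for the build (petscversion.h) */
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Headers: %d.%d.%d\n",
                        PETSC_VERSION_MAJOR, PETSC_VERSION_MINOR, PETSC_VERSION_SUBMINOR));
 
  PetscCall(PetscFinalize());
  return 0;
}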

It is also possible to add complementary external libraries to PETSc. To do this, contact the User Support Team.

Documentation

Usage examples

In this example, we use an exercise based on the DMDA subclass of DM, which manages communication between PETSc algebraic structures and mesh data structures. This exercise is part of our PETSc training program. You can find more details in the course material available at the following address: IDRIS PETSC Training

  • Source code in C and compilation file:

dmda.c
#include <petscdmda.h>
#include <petscksp.h>
#include <petsctime.h>
 
int main(int argc, char **argv)
{
  const PetscInt nx = 10, ny = 10, stencil_size = 5;
  PetscInt i, j, its;
  Mat A;
  Vec x, b;
  KSP solver;
  const PetscReal rtol = 1.e-8;
  KSPConvergedReason reason;
  PetscReal errorNorm, rnorm;
  PetscLogDouble t1, t2;
  DM dm;
  DMDALocalInfo info;
  const PetscInt stencilWidth = 1;
  MatStencil row, col5[stencil_size];
  PetscScalar hx2, hy2, coef, coef5[stencil_size];
  PetscScalar **bgrid;
 
  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
  // Create the 2-D DMDA object
  PetscCall(DMDACreate2d(PETSC_COMM_WORLD, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DMDA_STENCIL_STAR,
                      nx, ny, PETSC_DECIDE, PETSC_DECIDE, 1, stencilWidth,
                      NULL, NULL, &dm));
 
  PetscCall(DMSetFromOptions(dm));
  PetscCall(DMSetUp(dm));
 
  // View the DMDA object
  PetscCall(DMView(dm, PETSC_VIEWER_STDOUT_WORLD));
 
  // Create the A matrix from the DMDA object
  PetscCall(DMCreateMatrix(dm, &A));
 
  // Retrieve local information from the DMDA object
  PetscCall(DMDAGetLocalInfo(dm, &info));
 
  hx2 = 1.0 / ((info.mx - 1) * (info.mx - 1));
  hy2 = 1.0 / ((info.my - 1) * (info.my - 1));
 
  coef = 1.0;
  coef5[0] = 2.0 / hx2 + 2.0 / hy2;
  coef5[1] = -1.0 / hx2; coef5[2] = -1.0 / hx2;
  coef5[3] = -1.0 / hy2; coef5[4] = -1.0 / hy2;
 
  // Loop on the grid points
  for (j = info.ys; j < info.ys + info.ym; j++) {
 
    for (i = info.xs; i < info.xs + info.xm; i++) {
      row.i = i; row.j = j; row.c = 0;
 
      if (i == 0 || i == (info.mx - 1) || j == 0 || j == (info.my - 1)) {
        // Set matrix values to enforce boundary conditions (homogeneous Dirichlet conditions)
        PetscCall(MatSetValuesStencil(A, 1, &row, 1, &row, &coef, INSERT_VALUES));
      } else {
        // Set matrix values for interior points
        col5[0].i = i; col5[0].j = j; col5[0].c = 0;
        col5[1].i = i - 1; col5[1].j = j; col5[1].c = 0;
        col5[2].i = i + 1; col5[2].j = j; col5[2].c = 0;
        col5[3].i = i; col5[3].j = j - 1; col5[3].c = 0;
        col5[4].i = i; col5[4].j = j + 1; col5[4].c = 0;
        PetscCall(MatSetValuesStencil(A, 1, &row, stencil_size, col5, coef5, INSERT_VALUES));
      }
    }
  }
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  // PetscCall(MatView(A, PETSC_VIEWER_STDOUT_WORLD));
 
  // Create global vectors b and x from the DMDA object
  PetscCall(DMCreateGlobalVector(dm, &b));
  PetscCall(DMCreateGlobalVector(dm, &x));
 
  PetscCall(VecSet(b, 0.0));
  PetscCall(VecSetRandom(x, NULL));
 
  PetscCall(KSPCreate(PETSC_COMM_WORLD, &solver));
  PetscCall(KSPSetOperators(solver, A, A));
  PetscCall(KSPSetTolerances(solver, rtol, PETSC_DEFAULT, PETSC_DEFAULT, PETSC_DEFAULT));
  PetscCall(KSPSetInitialGuessNonzero(solver, PETSC_TRUE));
  PetscCall(KSPSetFromOptions(solver));
  PetscCall(KSPSetUp(solver));
 
  PetscCall(KSPView(solver, PETSC_VIEWER_STDOUT_WORLD));
 
  PetscCall(PetscTime(&t1));
  PetscCall(KSPSolve(solver, b, x));
  PetscCall(PetscTime(&t2));
 
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Elapse time: %lf s\n", t2 - t1));
 
  PetscCall(KSPGetConvergedReason(solver, &reason));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Converged reason: %s\n", KSPConvergedReasons[reason]));
 
  PetscCall(KSPGetIterationNumber(solver, &its));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Iterations number: %d\n", its));
 
  PetscCall(KSPGetResidualNorm(solver, &rnorm));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Residual norm: %g\n", rnorm));
 
  PetscCall(VecNorm(x, NORM_2, &errorNorm));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Error norm (NORM_2): %g\n", errorNorm));
  PetscCall(VecNorm(x, NORM_INFINITY, &errorNorm));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Error norm (NORM_INFINITY): %g\n", errorNorm));
 
  PetscCall(KSPDestroy(&solver));
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&b));
  PetscCall(DMDestroy(&dm));
 
  PetscCall(PetscFinalize());
 
  return 0;
}
Makefile
ALL: dmda
 
SOURCES    = dmda.c
OBJ        = $(SOURCES:.c=.o)
EXE        = dmda.exe
CLEANFILES = ${OBJ} ${EXE}
 
include ${PETSC_DIR}/lib/petsc/conf/variables
include ${PETSC_DIR}/lib/petsc/conf/rules
 
dmda: ${OBJ}
	${CLINKER} -o ${EXE} ${OBJ} ${PETSC_LIB}

CPU Example

For the CPU case, compilation is performed after loading the module petsc/3.20.1-mpi, for example. The job is then executed using the following submission script:

jobslurm_cpu.sh
#!/bin/bash
#SBATCH --job-name=dmda_cpu        # name of job
#SBATCH --nodes=1
#SBATCH --ntasks=4                 # number of MPI tasks
#SBATCH --hint=nomultithread       # 1 thread per physical core (no hyperthreading)
#SBATCH --time=00:10:00            # maximum execution time requested (HH:MM:SS)
#SBATCH --output=dmda_cpu_%j.out  # name of output file
#SBATCH --error=dmda_cpu_%j.out   # name of error file (here, common with the output file)
#SBATCH --qos=qos_cpu-dev
 
cd ${SLURM_SUBMIT_DIR}
 
module purge
module load petsc/3.20.1-mpi
module list
 
view_args="-log_view" # to avoid slowing down due to -log_view_gpu_time
 
cpu_args="-dm_mat_type mpiaij -dm_vec_type mpi"
 
set -x
srun ./dmda.exe -da_grid_x 1000 -da_grid_y 1000 -ksp_type cg $cpu_args $view_args

GPU Example

The GPU version of PETSc, built with CUDA, is installed for all of our GPU partitions: V100, A100 and H100. The corresponding modules are available under the name petsc/3.22.1-mpi-cuda (for the A100 and H100 partitions, you must first load the module arch/a100 or arch/h100, respectively). Note: due to compatibility issues between the front-end/compilation nodes and the H100 partition, you must currently compile your PETSc code directly on the H100 nodes to generate an executable compatible with this partition.

On Jean Zay, the CUDA-aware OpenMPI libraries support GPUDirect, which optimizes communications between GPUs. For more information, please consult the documentation MPI CUDA-aware and GPUDirect.

In this example, we take advantage of GPUDirect for optimal performance in a multi-GPU case. This requires adapting the dmda.c code presented in the previous section. Below are the necessary modifications:

  • Addition of the binding step: include init_cuda.c in the SOURCES variable of the Makefile.
init_cuda.c
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda.h>
    #include <cuda_runtime.h>
 
    void initialisation_cuda()
    {
        char* local_rank_env;
        int local_rank;
        cudaError_t cudaRet;
 
        /* Get the local rank of the process from the environment variable set by
           Slurm; MPI_Comm_rank cannot be used here because this routine is called
           BEFORE MPI is initialized */
        local_rank_env = getenv("SLURM_LOCALID");
 
        if (local_rank_env) {
            local_rank = atoi(local_rank_env);
            /* Select the GPU to be used by each MPI process */
            cudaRet = cudaSetDevice(local_rank);
            if (cudaRet != cudaSuccess) {
                printf("Error: cudaSetDevice failed\n");
                exit(1);
            }
        } else {
            printf("Error: unable to determine the local rank of the process\n");
            exit(1);
        }
    }
  • Initialization of CUDA before MPI.
dmda_gpu.c
/* Initialize CUDA before MPI is initialized by PetscInitialize */
initialisation_cuda();
 
PetscFunctionBeginUser;
PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));

Here is a submission script adapted to run this exercise on A100 GPUs.

jobslurm_a100.sh
#!/bin/bash
#SBATCH --job-name=dmda_gpu        # name of job
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2          # number of MPI tasks per node
#SBATCH --gres=gpu:2                 # number of GPUs per node
#SBATCH --cpus-per-task=8          # number of cores per task for gpu_p5 (1/8 of the 8-GPU A100 node)
#SBATCH --hint=nomultithread       # hyperthreading is deactivated
#SBATCH --time=00:10:00            # maximum execution time requested (HH:MM:SS)
#SBATCH --output=dmda_gpu_a100%j.out  # name of output file
#SBATCH --error=dmda_gpu_a100%j.out   # name of error file (here, in common with the output file)
#SBATCH --qos=qos_gpu_a100-dev
#SBATCH -C a100
 
cd ${SLURM_SUBMIT_DIR}
 
module purge
module load arch/a100
module load petsc/3.22.1-mpi-cuda
module list
 
view_args="-log_view" # to avoid slowing down due to -log_view_gpu_time
 
if [ $SLURM_NTASKS == 1 ]
then
  ### mono-GPU :
  gpu_args="-dm_mat_type aijcusparse -dm_vec_type cuda"
else
  ### multi-GPU :
  gpu_args="-dm_mat_type mpiaijcusparse -dm_vec_type mpicuda"
fi
 
set -x
srun ./dmda.exe -da_grid_x 1000 -da_grid_y 1000 -ksp_type cg -pc_type hypre $gpu_args $view_args
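
The scripts above select the matrix and vector back end entirely through the -dm_mat_type and -dm_vec_type run-time options, which lets the same dmda.c serve for both the CPU and GPU runs. If you prefer to fix these types in the source code, DMSetMatType() and DMSetVecType() can be called on the DMDA before the matrix and vectors are created, as in the following sketch. It is not part of the exercise (the file name dmda_types.c is arbitrary) and the GPU types shown assume a CUDA-enabled build such as petsc/3.22.1-mpi-cuda; with petsc/3.20.1-mpi you would use MATMPIAIJ and VECMPI instead.

dmda_types.c
#include <petscdmda.h>
 
int main(int argc, char **argv)
{
  DM      dm;
  Mat     A;
  Vec     x;
  MatType mtype;
  VecType vtype;
 
  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
 
  PetscCall(DMDACreate2d(PETSC_COMM_WORLD, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DMDA_STENCIL_STAR,
                         10, 10, PETSC_DECIDE, PETSC_DECIDE, 1, 1, NULL, NULL, &dm));
 
  /* Fix the back end in the source: cuSPARSE matrices and CUDA vectors
     (requires a CUDA-enabled PETSc build) */
  PetscCall(DMSetMatType(dm, MATMPIAIJCUSPARSE));
  PetscCall(DMSetVecType(dm, VECMPICUDA));
 
  /* -dm_mat_type / -dm_vec_type given on the command line can still override these */
  PetscCall(DMSetFromOptions(dm));
  PetscCall(DMSetUp(dm));
 
  PetscCall(DMCreateMatrix(dm, &A));
  PetscCall(DMCreateGlobalVector(dm, &x));
 
  PetscCall(MatGetType(A, &mtype));
  PetscCall(VecGetType(x, &vtype));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Mat type: %s, Vec type: %s\n", mtype, vtype));
 
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(DMDestroy(&dm));
  PetscCall(PetscFinalize());
  return 0;
}

Keeping the run-time options, as in the exercise, has the advantage that the same source file can be used unchanged for both CPU and GPU runs.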