Difference between revisions of "Using petsc (parallel)"
| (One intermediate revision by the same user not shown) | |||
| Line 40: | Line 40: | ||
<tr><td> 2 </td> <td> 2 </td> <td>37</td> </tr> | <tr><td> 2 </td> <td> 2 </td> <td>37</td> </tr> | ||
<tr><td> 4 </td> <td> 4 </td> <td>10</td> </tr> | <tr><td> 4 </td> <td> 4 </td> <td>10</td> </tr> | ||
| − | <tr><td> 8 </td> <td> 4 </td> <td>1.5</td> | + | <tr><td> 8 </td> <td> 4 </td> <td>1.5</td>/tr> |
<tr><td> 12</td> <td> 4 </td> <td>1.2</td> </tr> | <tr><td> 12</td> <td> 4 </td> <td>1.2</td> </tr> | ||
</table> | </table> | ||
| Line 48: | Line 48: | ||
<table border=1> | <table border=1> | ||
| − | <tr><th>nodes</th> <th>cores</th> <th>time 1(s)</th><th>time 2(s)</th>th>time 3(s)</th></tr> | + | <tr><th>nodes</th> <th>cores</th> <th>time 1(s)</th><th>time 2(s)</th><th>time 3(s)</th></tr> |
| − | <tr><td> 1 </td> <td> 8 </td> <td>35</td> <td> </td> <td> </td> </tr> | + | <tr><td> 1 </td> <td> 8 </td> <td>35</td> <td> 94 </td> <td> </td> </tr> |
| − | <tr><td> 2 </td> <td> 8 </td> <td>70</td> <td> </td> <td> </td> </tr> | + | <tr><td> 2 </td> <td> 8 </td> <td>70</td> <td> 63 </td> <td> </td> </tr> |
| − | <tr><td> 3 </td> <td> 8 </td> <td>89</td> <td> </td> <td> </td> </tr> | + | <tr><td> 3 </td> <td> 8 </td> <td>89</td> <td> 11 </td> <td> </td> </tr> |
| − | <tr><td> 4 </td> <td> 8 </td> <td>8.6</td> <td> </td> <td> </td> </tr> | + | <tr><td> 4 </td> <td> 8 </td> <td>8.6</td> <td> 7 </td> <td> </td> </tr> |
| − | <tr><td> 5 </td> <td> 8 </td> <td>5.4</td> <td> </td> <td> </td> </tr> | + | <tr><td> 5 </td> <td> 8 </td> <td>5.4</td> <td> 7 </td> <td> </td> </tr> |
| − | <tr><td> 6 </td> <td> 8 </td> <td>4.5</td> <td> </td> <td> </td> </tr> | + | <tr><td> 6 </td> <td> 8 </td> <td>4.5</td> <td> 6 </td> <td> </td> </tr> |
| − | <tr><td> 7 </td> <td> 8 </td> <td>6.2</td> <td> </td> <td> </td> </tr> | + | <tr><td> 7 </td> <td> 8 </td> <td>6.2</td> <td> 5 </td> <td> </td> </tr> |
| − | <tr><td> 8 </td> <td> 8 </td> <td>4.6</td> <td> </td> <td> </td> </tr> | + | <tr><td> 8 </td> <td> 8 </td> <td>4.6</td> <td> 84</td> <td> </td> </tr> |
</table> | </table> | ||
| + | |||
| + | scaling is occasionally destroyed by long slowdowns (file system?) | ||
<pre> | <pre> | ||
Latest revision as of 14:43, 1 April 2010
And here is a quick parallel code - compile as described in "Using petsc", but run it like

mpirun -n <N procs> ./a.out <other options>
On my workstation, -n 2 actually gives better performance than -n 1 (it has four cores, but multicore CPUs usually do little for petsc), though not if I bump the problem up to 1024x1024.
On bluecrystalp1, running a 1024x1024 problem with -pc_type sor -ksp_type bcgs (with the problem parameters picked to make iterative solution hard), I get:

1 node, 1 core: 70s
1 node, 2 cores: 45s
1 node, 4 cores: 32s
2 nodes, 4 cores each: 14s
4 nodes, 1 core each: 23s
compared to the 49s required on my workstation with one core. Asking for 4 nodes seems to result in lengthy queuing.
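For reference, putting those solver options together with the mpirun line above, the 2-nodes, 4-cores-each run would be launched with something like

mpirun -n 8 ./a.out -pc_type sor -ksp_type bcgs

though the exact mpirun invocation on the cluster depends on how the queue system hands the node list to MPI.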
Using mumps instead (-pc_type lu -pc_factor_mat_solver_package mumps) I get:

1 node, 1 core: 77s
1 node, 2 cores: 48s
1 node, 4 cores: 31s
2 nodes, 1 core each: 47s
2 nodes, 2 cores each: 29s
2 nodes, 4 cores each: 26s
4 nodes, 2 cores each: 26s
Does this suggest that the iterative method scales better across nodes? Perhaps not surprising, as all it needs to do is matrix-vector multiplies, which would mean the number of interprocess comms growing like O(nCore) only.
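As a rough count (assuming the matrix is split into P contiguous blocks of rows of the n x n five-point stencil, which is what MatCreateMpiAIJ with PETSC_DECIDE does in the code below): each matrix-vector product only needs every process to exchange its two boundary grid rows, about n values each, with neighbouring processes. So per multiply the communication is roughly O(P) messages of O(n) data, while the local work is about 5 n^2 / P multiply-adds - the communication grows only like the number of processes, and the local work shrinks like 1/P.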
Results for BC2 & bcgs:

| nodes | cores per node | time (s) |
|---|---|---|
| 1 | 1 | 67 |
| 1 | 2 | 94 |
| 2 | 2 | 37 |
| 4 | 4 | 10 |
| 8 | 4 | 1.5 |
| 12 | 4 | 1.2 |
However, a number of the submitted jobs failed with an OpenMPI error, "assign_context command failed: Device or resource busy". Grabbing all the cores on a node solves this problem (a conflict between MPI choices?).
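If the scheduler is PBS/Torque-style (an assumption here - adjust to whatever the cluster actually runs), grabbing whole nodes just means requesting every core on each node, e.g.

#PBS -l nodes=4:ppn=8

and then launching with as many MPI processes as cores requested.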
| nodes | cores per node | time 1 (s) | time 2 (s) | time 3 (s) |
|---|---|---|---|---|
| 1 | 8 | 35 | 94 | |
| 2 | 8 | 70 | 63 | |
| 3 | 8 | 89 | 11 | |
| 4 | 8 | 8.6 | 7 | |
| 5 | 8 | 5.4 | 7 | |
| 6 | 8 | 4.5 | 6 | |
| 7 | 8 | 6.2 | 5 | |
| 8 | 8 | 4.6 | 84 | |
Scaling is occasionally destroyed by long slowdowns (file system?).
1 node, 4 cores: 30.15s
2 nodes, 4 cores each:
4 nodes, 4 cores each: 4.8s
8 nodes, 4 cores each: 1.7s
!! solve a 2D n x n (a(x,y)u')' + b(x,y) u = c(x,y)
!! with periodic boundary conditions in y, u(x=0,y) = 0 and u'(x=1,y)= d
!! a(x,y) = 1 for x < 1/2, h for x > 1/2
!! b(x,y) = 1 for x < 1/2, r for x > 1/2
!! c(x,y) = 1 for x < 1/2, p for x > 1/2
program main
implicit none
#include "finclude/petsc.h"
#include "finclude/petscvec.h"
#include "finclude/petscmat.h"
#include "finclude/petscksp.h"
#include "finclude/petscpc.h"
Mat A
Vec x,b, res
KSP ksp
PetscInt ierr, ndof, n, m, i , j, lbegin, lend
PetscScalar dfluxe(0:1), dfluxw(0:1), dfluxn(0:1), dfluxs(0:1), su(0:0), sc, &
h, r, p , d, rnorm, bnorm, one
PetscInt row(0:0), fcol(0:1)
n = 256
m = n/2
ndof = n * n
!x > 1/2 diffusion = h u'
h = 1e-2
!x > 1/2 dissipation = r u , set small r to have problems inverting the matrix
!so that direct solvers win
r = 1e-6
!x > 1/2 source = p
p = 0.5
! u' at x = 1
d = 100
! this has to be the first petsc call
call PetscInitialize(PETSC_NULL_CHARACTER,ierr)
!allocate space for matrix elements: at most 5 nonzeros per row in the local
!(diagonal) block and 1 per row in the off-process block
call MatCreateMpiAIJ(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, &
ndof, ndof, 5, PETSC_NULL_INTEGER, 1,PETSC_NULL_INTEGER, A,ierr)
! allocate the solution, rhs and residual vectors to match A
call MatGetVecs(A, x, b, ierr);
call VecDuplicate(b, res, ierr) ! res gets the same layout as b
!establish which dofs are on this processor
call MatGetOwnershipRange(A, lbegin, lend, ierr)
! bulk (interior) cells: su/sc are the source coefficient and rhs value;
! dflux*(0) multiplies the cell itself and dflux*(1) its neighbour
dfluxe(0) = 1.0
dfluxe(1) = -1.0
su(0) = 1.0
sc = 1.0
do i = 1, n-2
dfluxw(0) = dfluxe(0)
dfluxw(1) = dfluxe(1)
dfluxn(0) = 1.0
dfluxn(1) = -1.0
! x > 1/2: switch to the second set of coefficients (a = h, b = r, c = p)
if (i > m) then
su(0) = r
sc = p
dfluxe(0) = h
dfluxe(1) = -h
dfluxn(0) = h
dfluxn(1) = -h
endif
dfluxs(0) = dfluxn(0)
dfluxs(1) = dfluxn(1)
do j = 1, n-2
row(0) = j*n + i
if ((lbegin .le. row(0)) .and. (lend .gt. row(0))) then
! source term
call MatSetValues(A, 1 ,row, 1, row, su, ADD_VALUES,ierr)
call VecSetValue( b, row(0) , sc , INSERT_VALUES, ierr)
! fluxes
fcol(0) = row(0)
fcol(1) = row(0) + 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxe, ADD_VALUES,ierr)
fcol(1) = row(0) - 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxw, ADD_VALUES,ierr)
fcol(1) = row(0) + n
call MatSetValues(A, 1 ,row, 2, fcol, dfluxn, ADD_VALUES,ierr)
fcol(1) = row(0) - n
call MatSetValues(A, 1 ,row, 2, fcol, dfluxs, ADD_VALUES,ierr)
end if
end do
! periodic boundaries
j = 0
row(0) = j*n + i
if ((lbegin .le. row(0)) .and. (lend .gt. row(0))) then
! source term
call MatSetValues(A, 1 ,row, 1, row, su, ADD_VALUES,ierr)
call VecSetValue( b, row(0) , sc , INSERT_VALUES, ierr)
! fluxes
fcol(0) = row(0)
fcol(1) = row(0) + 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxe, ADD_VALUES,ierr)
fcol(1) = row(0) - 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxw, ADD_VALUES,ierr)
fcol(1) = row(0) + n
call MatSetValues(A, 1 ,row, 2, fcol, dfluxn, ADD_VALUES,ierr)
fcol(1) = (n-1)*n + i
call MatSetValues(A, 1 ,row, 2, fcol, dfluxs, ADD_VALUES,ierr)
end if
j = n-1
row(0) = j*n + i
if ((lbegin .le. row(0)) .and. (lend .gt. row(0))) then
! source term
call MatSetValues(A, 1 ,row, 1, row, su, ADD_VALUES,ierr)
call VecSetValue( b, row(0) , sc , INSERT_VALUES, ierr)
! fluxes
fcol(0) = row(0)
fcol(1) = row(0) + 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxe, ADD_VALUES,ierr)
fcol(1) = row(0) - 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxw, ADD_VALUES,ierr)
fcol(1) = row(0) - n
call MatSetValues(A, 1 ,row, 2, fcol, dfluxn, ADD_VALUES,ierr)
fcol(1) = i
call MatSetValues(A, 1 ,row, 2, fcol, dfluxs, ADD_VALUES,ierr)
end if
end do
!x-boundaries
dfluxw(0) = h
dfluxw(1) = -h
do j = 0, n - 1
! x = 0 boundary: u = 0, enforced with just a diagonal entry (rhs stays zero)
i = 0
row(0) = j*n + i
if ((lbegin .le. row(0)) .and. (lend .gt. row(0))) then
call MatSetValues(A, 1 ,row, 1, row, su, ADD_VALUES,ierr)
end if
! x = 1 boundary: u' = d, enforced through the rhs and a west flux
i = n - 1
row(0) = j*n + i
if ((lbegin .le. row(0)) .and. (lend .gt. row(0))) then
su = r
sc = h*d
call MatSetValues(A, 1 ,row, 1, row, su, ADD_VALUES,ierr)
call VecSetValue( b, row(0) , sc , INSERT_VALUES, ierr)
fcol(0) = row(0)
fcol(1) = row(0) - 1
call MatSetValues(A, 1 ,row, 2, fcol, dfluxw, ADD_VALUES,ierr)
end if
end do
!needs to be done before A can be used
call MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY,ierr)
call MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY,ierr)
!KSP context: KSP is petsc's interface to linear solvers
call KSPCreate(PETSC_COMM_WORLD,ksp,ierr)
!A defines both the operator and the preconditioner (though it need not)
call KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN,ierr)
!grab all the options from the command line
call KSPSetFromOptions(ksp,ierr)
! solve
call KSPSolve(ksp,b,x,ierr)
!check results
call MatMult(A, x, res, ierr)
! res <- A x - b  (note the scalar named "one" actually holds -1)
one = -1.0
call VecAXPY(res , one , b, ierr)
call VecNorm(res , NORM_2, rnorm, ierr)
call VecNorm(b , NORM_2, bnorm, ierr)
write(*,*) "||Ax - b||/||b|| = ", rnorm/bnorm
!clean up
call VecDestroy(x,ierr)
call VecDestroy(b,ierr)
call VecDestroy(res,ierr)
call MatDestroy(A,ierr)
call KSPDestroy(ksp,ierr)
!last petsc call
call PetscFinalize(ierr)
end program main
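To see what the solver is doing in these runs, the generic PETSc monitoring options can be appended to the command line (these are standard KSP options, not anything specific to this code), e.g.

mpirun -n <N procs> ./a.out -pc_type sor -ksp_type bcgs -ksp_monitor -ksp_converged_reason

-ksp_monitor prints the residual norm at every iteration, and -ksp_converged_reason reports why the iteration stopped.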