Code optimization

Hello @damianmoz and @annacfs, wrt parallelizing the SOR method, one possible option is to use a "checker-board" approach as described on this page: CS267: Notes for Lectures 15 and 16, Mar 5 and 7, 1996.

The basic idea is to split your array into two groups of non-adjacent elements (in 2D, this ends up looking like the squares of a checkerboard). Then, the new values of each group can be computed in parallel, because they don't rely on the values of the neighboring elements.

I've put together a modified version of your code that uses this approach. It also makes some small adjustments to the array initialization, and simplifies the calc_erro proc by doing a maximum reduction at the same time as the computation.

I can't be 100% sure that this will match results from your original Fortran code, but hopefully it's a helpful start (at least conceptually) for parallelizing that loop:


use Time;
var runtime: Timer;

// set up domains
config const imax, jmax, kmax = 10;
const domPoisson = {1..imax, 1..jmax, 1..kmax},     // the full domain
      dpInner = domPoisson[domPoisson.expand(-1)];  // the full domain, not including boundaries

// set up arrays
var f, tf : [domPoisson] real = 0.0;

// set up other parameters
const Lx = 1.0, Ly = 1.0,  Lz = 1.0 : real;
const dx = Lx/imax, dy = Ly/jmax, dz = Lz/kmax : real;
const w  = 1.3 : real;
const dx2 = dx*dx, dy2 = dy*dy, dz2 = dz*dz: real;
const beta1 = dx2/dy2,
      beta2 = dx2/dz2 : real;
const tol = 1E-6: real;
const NPRIN = 500;

var i, j, k: int;
var itera: int = 0;
var erro : real = 1E0;
var max_f, min_f : real;

// create arrays of x,y,z coordinates so that we don't have to recompute them during iteration:
const x_coords = [i in domPoisson.dim(0)] (i - 0.5) * dx,
      y_coords = [j in domPoisson.dim(1)] (j - 0.5) * dy,
      z_coords = [k in domPoisson.dim(2)] (k - 0.5) * dz;

// initialize source term:
forall (i, j, k) in domPoisson do
    tf[i, j, k] = -3.0 * sin(x_coords[i]) * cos(y_coords[j]) * sin(z_coords[k]);

// initialize boundary conditions:
forall j in domPoisson.dim(1) {
    for k in domPoisson.dim(2) {
        f(1, j, k) = sin(0.5) * cos(y_coords[j]) * sin(z_coords[k]);
        f(imax, j, k) = sin(x_coords[imax]) * cos(y_coords[j]) * sin(z_coords[k]);
    }
}

forall i in domPoisson.dim(0) {
    for k in domPoisson.dim(2) {
        f(i, 1, k) = sin(x_coords[i]) * cos(0.5) * sin(z_coords[k]);
        f(i, jmax, k) = sin(x_coords[i]) * cos(y_coords[jmax]) * sin(z_coords[k]);
    }
}

forall i in domPoisson.dim(0) {
    for j in domPoisson.dim(1) {
        f(i, j, 1) = sin(x_coords[1]) * cos(y_coords[j]) * sin(0.5);
        f(i, j, kmax) = sin(x_coords[1]) * cos(y_coords[j]) * sin(z_coords[kmax]);
    }
}

// temporary copy of 'f'
var f_next = f;

runtime.start();
while erro  >= tol {
    // compute 'f_next'
    sor_parallel(f_next, f, tf, dx2, beta1, beta2, w);

    // swap 'f_next' into 'f' for the next iteration
    f <=> f_next;

    // compute the error residual
    erro = calc_erro(f, tf, dx2, dy2, dz2);

    itera += 1;
    if (mod(itera,NPRIN) == 0) {
        writef('%i %6.12dr\n', itera, erro);
    }
}
runtime.stop();

escreve(f, imax,  jmax, kmax, dx, dy, dz);

max_f = max reduce(f);
min_f = min reduce(f);

writef("%12.8er   %12.8er \n", max_f, min_f);
writef(" CPU TIME CHAPEL  = %10.16dr \n", runtime.elapsed());
writef("%i   %12.8er \n",itera, erro);
writef("------------------------------------------------------------------------------ \n");

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

proc sor_parallel(ref f_next, const ref f, const ref tf, dx2: real, beta1: real, beta2: real, w: real) {

    // update the "red spaces" in the checkerboard domain in parallel
    forall (i, j, k) in dpInner {
        if ((i+j)%2 == 0 && k%2 == 0) || ((i+j)%2 != 0 && k%2 != 0) {
            f_next(i,j,k) = (1.0 - w) * f(i, j, k) + w *
                    (-dx2*tf(i,j,k) + f(i-1,j,k) + f(i+1,j,k) +
                    beta1*( f(i,j-1,k) + f(i,j+1,k) ) +
                    beta2*( f(i,j,k-1) + f(i,j,k+1) )) /
                    ( 2.0*(beta1 + beta2 + 1.0) );
        }
    }

    // update the "black spaces" in the checkerboard domain in parallel
    forall (i, j, k) in dpInner {
        if ((i+j)%2 != 0 && k%2 == 0) || ((i+j)%2 == 0 && k%2 != 0) {
            f_next(i,j,k) = (1.0 - w) * f(i, j, k) + w *
                    (-dx2*tf(i,j,k) + f_next(i-1,j,k) + f_next(i+1,j,k) +
                    beta1*( f_next(i,j-1,k) + f_next(i,j+1,k) ) +
                    beta2*( f_next(i,j,k-1) + f_next(i,j,k+1) )) /
                    ( 2.0*(beta1 + beta2 + 1.0) );
        }
    }
}

proc calc_erro(f, tf, dx2: real, dy2: real, dz2: real): real {
    // do a min-reduction during error calculation,
    //      so that results don't have to be stored in 'residuo'
    var max_error: real;
    forall (i, j, k) in dpInner with (max reduce max_error) {
        max_error reduce= abs(tf(i,j,k) -
            (f(i-1,j,k) - 2.0*f(i,j,k) + f(i+1,j,k))/dx2 -
            (f(i,j-1,k) - 2.0*f(i,j,k) + f(i,j+1,k))/dy2 -
            (f(i,j,k-1) - 2.0*f(i,j,k) + f(i,j,k+1))/dz2);
    }
    return max_error;
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

proc escreve(f, imax: int, jmax: int, kmax: int, dx: real, dy: real, dz: real){
    use IO;
	var erro: real;
    const saida = openwriter("saida_chapel_par.csv");

    saida.writef("  %s  %s   %s  %s  %s  %s  %s  %s %s \n",  "x", ",", "y", ",", "z", ",", "f", ",", "e");
    for (i, j, k) in domPoisson {
        erro = abs( f(i, j, k) - (sin(x_coords[i]) * cos(y_coords[j]) * sin(z_coords[k])) );
        saida.writef("%10.12dr  %s %10.12dr   %s  %10.12dr  %s  %10.12dr  %s  %10.12dr  \n",
        		            x_coords[i], ",", y_coords[j], ",", z_coords[k], ",", f(i,j,k), ",", erro);
    }

    saida.close();
}