Code optimization

Hello guys,

I'm writing code to solve the navier stokes equations but the poisson problem isn't scaled as I'd like. Can anyone help me optimize the code?

Attached is a brief description and the code: PDF description

I'm using single locale;

CHPL_HOST_PLATFORM: linux64
CHPL_HOST_COMPILER: gnu
CHPL_HOST_CC: gcc
CHPL_HOST_CXX: g++
CHPL_HOST_ARCH: x86_64
CHPL_TARGET_PLATFORM: linux64
CHPL_TARGET_COMPILER: gnu
CHPL_TARGET_CC: gcc
CHPL_TARGET_CXX: g++
CHPL_TARGET_ARCH: x86_64
CHPL_TARGET_CPU: native
CHPL_LOCALE_MODEL: flat
CHPL_GPU_CODEGEN: none
CHPL_COMM: none
CHPL_TASKS: fifo *
CHPL_LAUNCHER: none
CHPL_TIMERS: generic
CHPL_UNWIND: none
CHPL_HOST_MEM: cstdlib
CHPL_MEM: cstdlib *
CHPL_ATOMICS: cstdlib
CHPL_GMP: none
CHPL_HWLOC: none *
CHPL_RE2: none
CHPL_LLVM: unset
CHPL_LLVM_CONFIG: none
CHPL_LLVM_VERSION: 0
CHPL_AUX_FILESYS: none
CHPL_LIB_PIC: none
CHPL_SANITIZE: none
CHPL_SANITIZE_EXE: none

module load chapel/1.27.0

source /modules/apps/chapel/chapel-1.27.0/util/setchplenv.bash && export CHPL_MEM=cstdlib CHPL_HWLOC=none CHPL_TASKS=fifo CHPL_RT_NUM_THREADS_PER_LOCALE=2

cd /lustre/anna2/poisson/chapel/grid130/

chpl --fast eliptico3D_coforall.chpl

time ./eliptico3D_coforall --imax=130 --jmax=130 --kmax=130 --npx=1 --npy=1 --npz=2 > coforall_130_112

chpl --fast eliptico3D_forall.chpl

time ./eliptico3D_forall --imax=130 --jmax=130 --kmax=130 > forall_130_2

Serial

// Finite Difference Solver for the Poisson equation (3D)  
// Author: JESUS, Anna  && GRION, Livia   
// Last Modification: 20/11/2022
// Serial version
 
	use Time;
	var runtime: Timer;
	
	config const imax, jmax, kmax : int;	 
	const Lx = 1.0, Ly = 1.0,  Lz = 1.0 : real;
	const dx = Lx/imax, dy = Ly/jmax, dz = Lz/kmax : real;      
	const w  = 1.3 : real;										
	const dx2 = dx*dx, dy2 = dy*dy, dz2 = dz*dz: real;
	const beta1 = dx2/dy2 : real;	
	const beta2 = dx2/dz2 : real;	 
	const tol = 1E-6: real;
    const NPRIN = 500: int;
    
	var f, tf, residuo : [1..imax, 1..jmax, 1..kmax] real; 	  
	var i, j, k: int;
	var itera: int;
    var x, y, z, erro : real;
    var max_f, min_f : real;
    
 	runtime.start(); 
 	
 	serial{
    itera = 0;
    residuo = 0.0; 
    

	  for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;	       		           		    
     		    
                tf(i,j,k) = -3.0*sin(x)*cos(y)*sin(z);
     		    
     		    }
     		}
        } 
        
            // Direcao - x
     		for j in 1..jmax do{     		   
     		    y  = (j-0.5)*dy;       		    
     		        for k in 1..kmax do{
     		        z = (k-0.5)*dz;     		    
     		       
     		        x = 0.5;
                    f(1,j,k) = sin(x)*cos(y)*sin(z);
                
                    x = (imax-0.5)*dx;
                    f(imax,j,k) = sin(x)*cos(y)*sin(z);
     		         
     		    }
     	    }
     		    
     		     
            // Direcao - y 
	        for i in 1..imax do{ 
	            x = (i-0.5)*dx;
     		    
     		    for k in 1..kmax do{
     		        z = (k-0.5)*dy;
     		    
     		        y = 0.5;
                    f(i,1,k) = sin(x)*cos(y)*sin(z); 
                    
                    y = (jmax-0.5)*dy;
                    f(i,jmax,k) = sin(x)*cos(y)*sin(z);  
     		     
     		    }
     		 }
     		     
         // Direcao - z 	 	   
	     for i in 1..imax do{
	        x  = (i-0.5)*dx; 	       
     		   
     		   for j in 1..jmax do{ 
     		   y = (j-0.5)*dy; 
     		    
     		   z = 0.5;          
               f(i,j,1) = sin(x)*cos(y)*sin(z);
                
               z = (kmax-0.5)*dz ;        
               f(i,j,kmax) = sin(x)*cos(y)*sin(z);  
     		    
     		     
     		 }
        }
     		     
		erro = 1E0;
		itera = 0;
  
		    while (erro  >= tol) do{    
            
		    f = sor_method(f, tf, imax, jmax, kmax, dx2, beta1, beta2, w);		 			 		 
			residuo  = calc_erro(f, tf, imax, jmax, kmax, dx2, dy2, dz2);   

            erro = max reduce(residuo); 

            itera += 1;	  
					
		    
		        if (mod(itera,NPRIN) == 0) then{		  		
		            writef('%i %6.12dr\n', itera, erro); 
		        } 
  
            }
        } // fim serial
 		 
 		runtime.stop();
 		 	
   		escreve(f, imax,  jmax, kmax, dx, dy, dz);
   		
 
        max_f = max reduce(f); 
        min_f = min reduce(f);     

	writef("%12.8er   %12.8er \n", max_f, min_f);				                 				     
    writef(" CPU TIME CHAPEL  = %10.16dr \n",runtime.elapsed());
	writef("%i   %12.8er \n",itera, erro);						                 
	writef("------------------------------------------------------------------------------ \n");  
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	proc sor_method(f, tf, imax: int, jmax: int,  kmax: int, dx2: real, beta1: real,  beta2: real,  w: real){  
	
     for i in 2..imax-1  do{
     		for j in 2..jmax-1  do{
     		    for k in 2..kmax-1  do{
     		  
                    f(i,j,k) =  (1.0 - w)*f(i,j,k) +  
                                w * (-dx2*tf(i,j,k) + f(i-1,j,k) + f(i+1,j,k) +
                                beta1*( f(i,j-1,k) + f(i,j+1,k) ) + 
                                beta2*( f(i,j,k-1) + f(i,j,k+1) ))/ ( 2.0*(beta1 + beta2 + 1.0) ) ;
            }   
        }   
    }   
 
		return f;     
       
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

	proc calc_erro(f, tf, imax: int, jmax: int,  kmax: int,  dx2: real, dy2: real, dz2: real){

	var errol : [1..imax,1..jmax, 1..kmax] real; 
	errol = 0.0;

	 for i in 2..imax-1  do{
     		for j in 2..jmax-1  do{
     		    for k in 2..kmax-1  do{
		 
      		        errol(i,j,k) =  abs(tf(i,j,k) -  
			                               (f(i-1,j,k) - 2.0*f(i,j,k) + f(i+1,j,k))/dx2 -
			                               (f(i,j-1,k) - 2.0*f(i,j,k) + f(i,j+1,k))/dy2 -
			                               (f(i,j,k-1) - 2.0*f(i,j,k) + f(i,j,k+1))/dz2);   
 		    }
  	    }
  	}
     		return errol;

}

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
    proc escreve(f, imax: int, jmax: int, kmax: int, dx: real, dy: real, dz: real){  
 
    use IO;                  
	var erro: [1..imax,1..jmax, 1..kmax] real; 
	erro = 0.0;
    const saida = openwriter("saida_serial.csv");  
   
    saida.writef("  %s  %s   %s  %s  %s  %s  %s  %s %s \n",  "x", ",", "y", ",", "z", ",", "f", ",", "e");
	for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;
			            
			    erro(i,j,k) = abs(f(i,j,k) - sin(x)*cos(y)*sin(z));
			            
        		saida.writef("%10.12dr  %s %10.12dr   %s  %10.12dr  %s  %10.12dr  %s  %10.12dr  \n", 
        		            x, ",", y, ",", z, ",", f(i,j,k), ",", erro(i,j,k));
        	 }
 
		  }
    }
saida.close();
 
 }
 

Coforall

// Finite Difference Solver for the Poisson equation (3D)  
// Author: JESUS, Anna  && GRION, Livia   
// Last Modification: 20/11/2022
// Serial version
 
	use Time;
	var runtime: Timer;
	
	config const imax, jmax, kmax : int;	 
    config const npx, npy, npz: int;	  
    const ptsx = (imax-2)/npx, ptsy = (jmax-2)/npy, ptsz = (kmax-2)/npz : int;    
	const Lx = 1.0, Ly = 1.0,  Lz = 1.0 : real;
	const dx = Lx/imax, dy = Ly/jmax, dz = Lz/kmax : real;      
	const w  = 1.3 : real;										
	const dx2 = dx*dx, dy2 = dy*dy, dz2 = dz*dz: real;
	const beta1 = dx2/dy2 : real;	
	const beta2 = dx2/dz2 : real;	 
	const tol = 1E-6: real;
    const NPRIN = 500;
    const Inner = {2..imax-1, 2..jmax-1,  2..kmax-1};
	
	var f, tf, residuo : [1..imax, 1..jmax, 1..kmax] real; 	  
 
	var i, j, k: int;
	var itera: int;
    var x, y, z, erro : real;
    var max_f, min_f : real;
    
 	runtime.start(); 
 	
    itera = 0;
    residuo = 0.0;
 
	  for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;
     		    
                tf(i,j,k) = -3.0*sin(x)*cos(y)*sin(z);
     		    
     		    }
     		}
       }
        
            // Direcao - x
     		for j in 1..jmax do{     		   
     		    y  = (j-0.5)*dy;       		    
     		        for k in 1..kmax do{
     		        z = (k-0.5)*dz;     		    
     		       
     		        x = 0.5;
                    f(1,j,k) = sin(x)*cos(y)*sin(z);
                
                    x = (imax-0.5)*dx;
                    f(imax,j,k) = sin(x)*cos(y)*sin(z);
     		         
     		    }
     	    }
     		     
            // Direcao - y 
	        for i in 1..imax do{ 
	            x = (i-0.5)*dx;
     		    
     		    for k in 1..kmax do{
     		        z = (k-0.5)*dy;
     		    
     		        y = 0.5;
                    f(i,1,k) = sin(x)*cos(y)*sin(z); 
                    
                    y = (jmax-0.5)*dy;
                    f(i,jmax,k) = sin(x)*cos(y)*sin(z);  
     		     
     		    }
     		 }
     		     
         // Direcao - z  	   
	     for i in 1..imax do{
	        x  = (i-0.5)*dx; 	       
     		   
     		   for j in 1..jmax do{ 
     		   y = (j-0.5)*dy; 
     		    
     		   z = 0.5;          
               f(i,j,1) = sin(x)*cos(y)*sin(z);
                
               z = (kmax-0.5)*dz ;        
               f(i,j,kmax) = sin(x)*cos(y)*sin(z);  
     		    
     		     
     		 }
        }
	 
		erro = 1E0;
		itera = 0;
 
		    while (erro  >= tol) do{    
            
		    f = sor_method(f, tf, imax, jmax, kmax, dx2, beta1, beta2, w, ptsx, ptsy, ptsz);		 			 		 
			residuo  = calc_erro(f, tf, imax, jmax, kmax, dx2, dy2, dz2, ptsx, ptsy, ptsz);   

            erro = max reduce(residuo); 

            itera += 1;	  
							    
		   if (mod(itera,NPRIN) == 0) then{		  		
		    writef('%i %6.12dr\n', itera, erro); 
		   } 
  
        }
 		 
 		runtime.stop();
 		 	
   	    escreve(f, imax,  jmax, kmax, dx, dy, dz); 
    
    max_f = max reduce(f); 
    min_f = min reduce(f); 
    
    writef("%12.8er   %12.8er \n", max_f, min_f);				                 				     
    writef(" CPU TIME CHAPEL  = %10.16dr \n",runtime.elapsed());
	writef("%i   %12.8er \n",itera, erro);						                 
	writef("------------------------------------------------------------------------------ \n");  
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	proc sor_method(f, tf, imax: int, jmax: int,  kmax: int, dx2: real, beta1: real,  beta2: real,  w: real, ptsx: int, ptsy: int,  ptsz: int){ 
  
    
 	coforall (ii,jj,kk) in  {2..imax-1, 2..jmax-1, 2..kmax-1} by (ptsx, ptsy, ptsz) {
 
     for i in ii..ii + ptsx-1 do{
     		for j in jj..jj + ptsy-1 do{
     		    for k in  kk..kk + ptsz-1  do{
     		  
                    f(i,j,k) =  (1.0 - w)*f(i,j,k) +  
                                w * (-dx2*tf(i,j,k) + f(i-1,j,k) + f(i+1,j,k) +
                                beta1*( f(i,j-1,k) + f(i,j+1,k) ) + 
                                beta2*( f(i,j,k-1) + f(i,j,k+1) ))/ ( 2.0*(beta1 + beta2 + 1.0) ) ;
            }   
        }   
    } 
    }  
 
		return f; 
       
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

	proc calc_erro(f, tf, imax: int, jmax: int,  kmax: int,  dx2: real, dy2: real, dz2: real,  ptsx: int, ptsy: int,  ptsz: int){

	var errol : [1..imax,1..jmax, 1..kmax] real; 
	errol = 0.0;

	coforall (ii,jj,kk) in {2..imax-1, 2..jmax-1, 2..kmax-1} by (ptsx, ptsy, ptsz) {
 
     for i in ii..ii + ptsx-1   do{
     		for j in jj..jj + ptsy-1 do{
     		    for k in  kk..kk + ptsz-1  do{
     		  
		 
      		        errol(i,j,k) =  abs(tf(i,j,k) -  
			                               (f(i-1,j,k) - 2.0*f(i,j,k) + f(i+1,j,k))/dx2 -
			                               (f(i,j-1,k) - 2.0*f(i,j,k) + f(i,j+1,k))/dy2 -
			                               (f(i,j,k-1) - 2.0*f(i,j,k) + f(i,j,k+1))/dz2);   
 		    }
  	    }
  	}
     
     }
		return errol;

}
 

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
    proc escreve(f, imax: int, jmax: int, kmax: int, dx: real, dy: real, dz: real){  
 
    use IO;                  
	var erro: [1..imax,1..jmax, 1..kmax] real; 
	erro = 0.0; 
	
    const saida = openwriter("saida_coforall.csv");  
   
    saida.writef("  %s  %s   %s  %s  %s  %s  %s  %s %s \n",  "x", ",", "y", ",", "z", ",", "f", ",", "e");
	for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;
			            
			    erro(i,j,k) = abs(f(i,j,k) - sin(x)*cos(y)*sin(z));
			            
        		saida.writef("%10.12dr  %s %10.12dr   %s  %10.12dr  %s  %10.12dr  %s  %10.12dr  \n", 
        		            x, ",", y, ",", z, ",", f(i,j,k), ",", erro(i,j,k));
        	 }
 
		  }
    }
saida.close();
 
 }

Forall

// Finite Difference Solver for the Poisson equation (3D)  
// Author: JESUS, Anna  && GRION, Livia   
// Last Modification: 20/10/2022
// Parallel version
 
	use Time;
	var runtime: Timer;
	
	config const imax, jmax, kmax : int;	 
	const Lx = 1.0, Ly = 1.0,  Lz = 1.0 : real;
	const dx = Lx/imax, dy = Ly/jmax, dz = Lz/kmax : real;       
	const w  = 1.3 : real;										
	const dx2 = dx*dx, dy2 = dy*dy, dz2 = dz*dz: real;
	const beta1 = dx2/dy2 : real;	
	const beta2 = dx2/dz2 : real;	 
	const tol = 1E-6: real;
 	const Inner = {2..imax-1, 2..jmax-1,  2..kmax-1};
    const domPoisson = {1..imax, 1..jmax, 1..kmax};
    const NPRIN = 500;
     		
	var f, tf, residuo : [1..imax, 1..jmax, 1..kmax] real; 	  
	var i, j, k: int;
	var itera: int;
    var x, y, z, erro : real; 
    var max_f, min_f : real;
        
        
    itera = 0;
    residuo = 0.0;
	
	runtime.start();
	 
	   for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;  		     
	       		    
                tf(i,j,k) = -3.0*sin(x)*cos(y)*sin(z);      		    
            
            }
          }
        
        }
        
            // Direcao - x
     	    for  j in 1..jmax do{     		   
     		    y  = (j-0.5)*dy;       		    
     		        for  k in 1..kmax do{
     		        z = (k-0.5)*dz;     		    
     		       
     		        x = 0.5;
                    f(1,j,k) = sin(x)*cos(y)*sin(z);
                
                    x = (imax-0.5)*dx;
                    f(imax,j,k) = sin(x)*cos(y)*sin(z);
     		         
     		    }
     	    } 
     		     
            // Direcao - y 
	        for  i in 1..imax do{ 
	            x = (i-0.5)*dx;
     		    
     		    for  k in 1..kmax do{
     		        z = (k-0.5)*dy;
     		    
     		        y = 0.5;
                    f(i,1,k) = sin(x)*cos(y)*sin(z); 
                    
                    y = (jmax-0.5)*dy;
                    f(i,jmax,k) = sin(x)*cos(y)*sin(z);  
     		     
     		    }
     		 }
     		      
     		   
	     for  i in 1..imax do{
	        x  = (i-0.5)*dx; 	       
     		   
     		   for  j in 1..jmax do{ 
     		   y = (j-0.5)*dy; 
     		    
     		   z = 0.5;          
               f(i,j,1) = sin(x)*cos(y)*sin(z);
                
               z = (kmax-0.5)*dz ;        
               f(i,j,kmax) = sin(x)*cos(y)*sin(z);  
     		    
     		     
     		 }
        } 
        
		erro = 1E0;
		itera = 0;
  
		    while (erro  >= tol) do{    
            
		    f = sor_method(f, tf, dx2, beta1, beta2, w);		 		 
			residuo  = calc_erro(f, tf, dx2, dy2, dz2);   
			erro = max reduce(residuo); 
            itera += 1;	  			
		  
		        if (mod(itera,NPRIN) == 0) then{		  		
		            writef('%i %6.12dr\n', itera, erro); 
		        } 
        
           } 
		  
   		 escreve(f, imax,  jmax, kmax, dx, dy, dz);
  
	runtime.stop(); 
	
    max_f = max reduce(f); 
    min_f = min reduce(f);  
	 	
	writef("%12.8er   %12.8er \n", max_f, min_f);				                 				     
    writef(" CPU TIME CHAPEL  = %10.16dr \n",runtime.elapsed());
	writef("%i   %12.8er \n",itera, erro);						                 
	writef("------------------------------------------------------------------------------ \n"); 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 


//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	proc sor_method(f, tf,  dx2: real, beta1: real,  beta2: real,  w: real){ 
 
 
     forall (i,j,k) in Inner  do{
     		  
                    f(i,j,k) =  (1.0 - w)*f(i,j,k) +  
                                w * (-dx2*tf(i,j,k) + f(i-1,j,k) + f(i+1,j,k) +
                                beta1*( f(i,j-1,k) + f(i,j+1,k) ) + 
                                beta2*( f(i,j,k-1) + f(i,j,k+1) ))/ ( 2.0*(beta1 + beta2 + 1.0) ) ;
            
    }    
		return f;     
       
} 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

	proc calc_erro(f, tf,  dx2: real, dy2: real, dz2: real){

	var errol : [1..imax,1..jmax, 1..kmax] real; 
	errol = 0.0;

	forall (i,j,k) in Inner  do{ 
	
      		        errol(i,j,k) =  abs(tf(i,j,k) -  
			                               (f(i-1,j,k) - 2.0*f(i,j,k) + f(i+1,j,k))/dx2 -
			                               (f(i,j-1,k) - 2.0*f(i,j,k) + f(i,j+1,k))/dy2 -
			                               (f(i,j,k-1) - 2.0*f(i,j,k) + f(i,j,k+1))/dz2);    
  	    }
     
		return errol;

} 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////


    proc escreve(f, imax: int, jmax: int, kmax: int, dx: real, dy: real, dz: real){  
 
    use IO;                  
	var erro: [1..imax,1..jmax, 1..kmax] real; 
    const saida = openwriter("saida_chapel_par.csv");  
   
   
    saida.writef("  %s  %s   %s  %s  %s  %s  %s  %s %s \n",  "x", ",", "y", ",", "z", ",", "f", ",", "e");
	for k in 1..kmax  do{    
	   z =  (k-0.5)*dz ;          
		
		for j in 1..jmax  do{
		  y =  (j-0.5)*dy;
		  
			for i in 1..imax  do{ 
			    x =  (i-0.5)*dx;
			            
			    erro(i,j,k) = abs(f(i,j,k) - sin(x)*cos(y)*sin(z));
			            
        		saida.writef("%10.12dr  %s %10.12dr   %s  %10.12dr  %s  %10.12dr  %s  %10.12dr  \n", 
        		            x, ",", y, ",", z, ",", f(i,j,k), ",", erro(i,j,k));
        	 }
 
		  }
    }
saida.close();
 
 } 

Anna, can you please attach the PDF and the raw chapel directly rather than require people to go to Google drive which requires separate access? Just a thought.

Did you implement all those earlier suggestions which were provided?

what suggestions? I can not upload the files here

OK. I did some cut and paste from the post and I now have the Chapel. I will see if I can compile it.

I will see what the Google drive request does..

I grabbed the PDF also.

Hello @annacfs, thanks for the question!

One very quick change that should make a reasonable performance difference: remove the return statements from sor_method and calc_erro.

  • In sor_method, f doesn't have to be returned because it's already an argument. (The array f will be modified by reference, so the f = part of the call is also not needed. It can just be called as: sor_method(f, tf, dx2, beta1, beta2, w);)

  • For calc_error, I'd delete the errol declaration and add an argument with the same name instead. So your proc would look like:

    proc calc_erro(errol, f, tf,  dx2: real, dy2: real, dz2: real){
    
    	forall (i,j,k) in Inner  do{ 
          		        errol(i,j,k) =  abs(tf(i,j,k) -  
    			                               (f(i-1,j,k) - 2.0*f(i,j,k) + f(i+1,j,k))/dx2 -
    			                               (f(i,j-1,k) - 2.0*f(i,j,k) + f(i,j+1,k))/dy2 -
    			                               (f(i,j,k-1) - 2.0*f(i,j,k) + f(i,j,k+1))/dz2);    
      	    } 
    }
    

    It can then be called as calc_erro(residuo, f, tf, dx2, dy2, dz2);, which will have the same effect, but will avoid allocating an extra array each iteration.

There were some suggestions made to the posts you made earlier in the year. I have not yet had time to look at the latest code in depth but it looks like you are still to implement all of those previous ideas.

Off the cuff,
a) there are far too many divisions and reliance on the compiler to optimize expressions; and
b) there are cases where a *forall should wrap the innermost loop which should use a for

I could be wrong but I think that you over parallelised the sor_method which is an example of (b).

forall (i, j) in do
{
    foreach k in 2..kmax-1 do
    {
       ...
    }
}

might be a better approach. I try to avoid having a forall as the innermost loop.

@jcorrado is on the right path - some of what he said was mentioned earlier in the year. The parameter t should/must have an inout intent. The overhead of making a copy would be horrendous.

Can you provide more detailed timings of individual segments of the program?

Thanks for the raw email.

Is Eq(2) recursive?

It seems that once you calculate f(i, j, 10) say, you need to use that value to then calculate f(i,j,11). Is that correct?

Consider the following. You might need to experiment. If imax is small, you might need to change this.

    proc sor_method(f, tf,  dx2: real, beta1: real,  beta2: real,  w: real)
    {
        // recompute what is stored inside Inner to avoid
        // using data outside the scope of this routine
        const ijk = f.domain.dims();
        const _i = ijk[0].expand(-1); // 2..imax-1
        const _j = ijk[1].expand(-1); // 2..jmax-1
        const _k = ijk[2].expand(-1); // 2..kmax-1
        const w1 = (1.0 - w);
        const w2 = (0.5 * w) / (beta1 + beta2 + 1.0);

        forall i in _i do // if imax is small, this might be sub-optimal
        {
            // the entire loop contents will be run as a single task
            // so ensure that each such task has lots of work to do

            for j in _j do
            {
                const ref fijm1 = f[i, j-1, ..], fijp1 = f[i, j+1, ..];
                const ref fim1j = f[i-1, j, ..], fip1j = f[i+1, j, ..];
                const ref tfij = tf[i, j, ..];
                ref fij = f[i, j, ..];
                var fijk = f[i, j, 1];

                for k in _k do // recursive - cannot be vectorized - sigh!!
                {
                    // superscalar and FMA opportunities here
                    // this ordering might be suboptimal but

                    const b0 = fim1j[k] + fip1j[k];
                    const b1 = fijm1[k] + fijp1[k];
                    const b2 = fijk + fij[k+1];
                    const bb = b1 * beta1 + b2 * beta2 + b0 - dx2 * tfij[k];

                    fijk = w1 * fij[k] + w2 * bb;
                    fij[k] = fijk;
                }
            }
        }
    }

Use the experience from implementing this with @jcorrado's advice to create a better calc_error

If vectorization is actually working well, the following might be better

    proc sor_kernel(_k, f, tf,  dx2: real, beta1: real)
    {
        var b : [_k] real;
        const ref fijm1 = f[i, j-1, ..];
        const ref fijp1 = f[i, j+1, ..];
        const ref fim1j = f[i-1, j, ..];
        const ref fip1j = f[i+1, j, ..];
        const ref tfij = tf[i, j, ..];

        foreach k in _k do // this can be vectorized
        {
            // there are superscalar and FMA opportunities here
            // note this ordering might be suboptimal here goes

            const b1 = fijm1[k] + fijp1[k];
            const b0 = fim1j[k] + fip1j[k];

            b[k] = b1 * beta1 + b0 - dx2 * tfij[k];
        }
        return b;
    }
    proc sor_method(f, tf,  dx2: real, beta1: real,  beta2: real,  w: real)
    {
        // recompute what is stored inside Inner to avoid
        // using data outside the scope of this routine
        const ijk = f.domain.dims();
        const _i = ijk[0].expand(-1); // 2..imax-1
        const _j = ijk[1].expand(-1); // 2..jmax-1
        const _k = ijk[2].expand(-1); // 2..kmax-1
        const w1 = (1.0 - w);
        const w2 = (0.5 * w) / (beta1 + beta2 + 1.0);

        forall i in _i do // if imax is small, this might be sub-optimal
        {
            // the entire loop contents will be run as a single task
            // so ensure that each such task has lots of work to do

            for j in _j do
            {
                ref fij = f[i, j, ..];
                var fijk = f[i, j, 1];
                const b = sor_kernel(_k, f, tf, dx2, beta1);

                for k in _k do // recursive - cannot be vectorized - sigh!!
                {
                    const b2 = fijk + fij[k+1];

                    fijk = w1 * fij[k] + w2 * (b2 * beta2 + b[k]);
                    fij[k] = fijk;
                }
            }
        }
    }

Try it. Experiment.

Nice!!!!! Let me think about it!
Thank u

I have not tested this. I hope I have not lost anything in the cutting and pasting I did. Others may have some better ideas. I am not sure how well the currrent version of the compiler is at vectorizing the code, something about which @mppf might have way better insight than I. Are you using the LLVM or GCC based variant of the compiler, and again, others can probably better advise you here but that will influence how well your code will vectorize. Maybe @jcorrado has some insightt here as well?

What are representative values of the problem size, i.e. what are the values of imax, jmax and kmax?

See also the discussion on Github issue #19636 which is related.

I tested imax=kmax=kmax = 130 in cluster. The size of the problem is on the order of 10^6; But my goal is to do 10^9

Thanks. My idea of using the forall on the outside loop is OK then.

Note that if you ultimately want to run this on multiple nodes of a cluster, you might need to use coforall and tweak the design of sor_method. But that can wait for now. Get things running nicely on your single node for now. I assume that the single node you are using in the cluster has two 28-core CPUs, meaning you have 56 cores available to you.

Also, in my opinion, your need to turn multi-threading off on your system. We always get better performance having a single thrread for each FPU. Others might have different experience using Chapel and I suggest you get their advice.

You could/should also look at unrolling the innermost loops.

I would also be curious whether a foreach loop is faster than an for loop unrolled by 4. If the former is no faster, then the original suggestion, with its innermost loop unrolled by at least 2 (or maybe 4), would probably be optimal.

You should force imax, jmax,and kmax to be an odd multiple of 2, This means the number of elements in a range like

2..imax-1

is always a multiple of 4. This makes loop unrolling a lot easier to implement. My 2c.

I ran your task on a 24-core system,. I expected a gain of about 11,

With the original code, I get
a) a serial performance of 690 seconds in 15000+ iterations
b) a parallel performance of 53 seconds in 22000+ iterations

The performance gain is about a factor of 13 which is slightly better that I thought. Do you numbers agree to within some tolerance? The parallelization of that recursive formula worries me.

When I run the modified version using the approach I suggested, I get
b) a parallel performance of 73 seconds in 15000+ iterations

The performance gain is about 9.5 which is still pretty good and the iteration count matches.

EDITED after the event

I rewrote in a simpler style, still handling the recursive aspects of the 3rd dimension loop, and the performance gain is now nearly 22 on the 24 core machine (with a total of 8 memory channels) and nearly 28 for the 52-core machine (with a total of 12 memory channels). The figure of 28 is misleading because the serial version runs at 3.9Ghz if it is the only task on this CPU whereas the parallel version with 52 cores runs at 2.1Ghz. If you scale that up, you get a factor of about 50 on a 52 core system. I will post the code tomorrow.

That said, I think there is some issue the allocation of cores on your cluster.

Can you please provide the Fortran+MPI reference code?

What do your 2 graphs mean? What serial run time do you get?

When you say 28 cores, 56 threads, what are the actual CPUs, i.e.

cat /proc/cpuinfo

on the Linux machine. The machine I am using has two 12-core Xeon 2650-v4 CPUs, so a total of 24 cores (and as I noted, only 24 threads which avoids 2 separate threads fighting over access to a single FPU as there as only as many FPUs as there are cores).

I also ran it on a two 18core Xeon 2695v4 which has more cores and there is no performance gain. The performance is somewhat dominated by memory access (or cache misses) so you quickly hit the limit. When I run it on a two 26core Xeon Gold 6230R with 50% more memory bandwidth, I see the run-times for the parallel version drop by nearly 50%.