External Issue: Performance comparison of channels in Chapel and Golang

18432, "Yudhishthira1406", "Performance comparison of channels in Chapel and Golang", "2021-09-16T12:50:28Z"

Summary of Problem

As part of GSoC 2021, I am adding Go-Style Channels to chapel. A major part of this project is to compare the performance between golang and chapel variants. I ran the following benchmarks in Golang and Chapel.
NOTE: All chapel programs are compiled with --fast flag.

Golang Benchmarks

chanUncontended.go
package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

func main() {
	C := 100
	N := 10000
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			for k := 0; k < N/procs; k++ {
				myc := make(chan int, C)
				for j := 0; j < C; j++ {
					myc <- 0
				}
				for j := 0; j < C; j++ {
					<-myc
				}
			}

		}()
	}
	wg.Wait()
	elapsed := float64(time.Since(start) / time.Millisecond)
	fmt.Printf("%f\n", elapsed/float64(N))
}



chanContended.go
package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

func main() {
	C := 100
	N := 10000
	runtime.GOMAXPROCS(4)
	procs := 4
	var wg sync.WaitGroup
	wg.Add(procs)
	myc := make(chan int, C*procs)
	start := time.Now()
	for p := 0; p < procs; p++ {
		go func() {
			for i := 0; i < N/procs; i++ {
				for j := 0; j < C; j++ {
					myc <- 0
				}
				for j := 0; j < C; j++ {
					x1 := <-myc
					x1++
				}
			}
			wg.Done()
		}()
	}
	wg.Wait()
	elapsed := float64(time.Since(start) / time.Millisecond)
	fmt.Printf("%f\n", elapsed/float64(N))
}

selectUncontended.go
package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

func main() {
	N := 10000
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			myc1 := make(chan int, 1)
			myc2 := make(chan int, 1)
			myc1 <- 0
			for j := 0; j < N/procs; j++ {
				select {
				case <-myc1:
					myc2 <- 0
				case <-myc2:
					myc1 <- 0
				}
			}
		}()
	}
	wg.Wait()
	elapsed := float64(time.Since(start) / time.Millisecond)
	fmt.Printf("%f\n", elapsed/float64(N))
}
selectSyncContended.go
package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

func main() {
	N := 10000
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	myc1 := make(chan int)
	myc2 := make(chan int)
	myc3 := make(chan int)
	done := make(chan int)
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			go func() {
				for {
					select {
					case myc1 <- 0:
					case myc2 <- 0:
					case myc3 <- 0:
					case <-done:
						return
					}
				}
			}()

			for j := 0; j < N/procs; j++ {
				select {
				case <-myc1:
				case <-myc2:
				case <-myc3:
				}
			}
		}()
	}
	wg.Wait()
	close(done)
	elapsed := float64(time.Since(start) / time.Millisecond)
	fmt.Printf("%f\n", elapsed/float64(N))
}

Chapel Benchmarks

chanUncontended.chpl
use Channel;
use Time;
config const n = 10000;
const numTasksPerLocale = if dataParTasksPerLocale > 0 then dataParTasksPerLocale
                                                       else here.maxTaskPar;
config const C = 100;
config const performanceTest = false;
var t : Timer;
t.start();

forall i in 0..#n {
    test();
}
t.stop();
var elapsed = t.elapsed();

if performanceTest then writeln("Time per operation : ", elapsed * 1000 / n, " ms");

proc test() {
    var myc = new chan(int, C);
    for i in 0..#C {
        myc.send(0);
    }
    for i in 0..#C {
        var x1 : int;
        myc.recv(x1);
    }
}

chanContended.chpl
use Channel;
use Time;
config const n = 10000;
const numTasksPerLocale = if dataParTasksPerLocale > 0 then dataParTasksPerLocale
                                                       else here.maxTaskPar;
config const C = 100;
config const performanceTest = false;
var t : Timer;
var myc = new chan(int, C * numTasksPerLocale);
t.start();


forall i in 0..#n {
    test();
}
t.stop();
var elapsed = t.elapsed();

if performanceTest then writeln("Time per operation : ", elapsed * 1000 / n, " ms");

proc test() {
    for i in 0..#C {
        myc.send(0);
    }
    for i in 0..#C {
        var x1 : int;
        myc.recv(x1);
    }
}

selectUncontended.chpl
use Channel;
use Time;
config const n = 10000;
const numTasksPerLocale = if dataParTasksPerLocale > 0 then dataParTasksPerLocale
                                                       else here.maxTaskPar;
config const performanceTest = false;
var cnt = n / numTasksPerLocale;
var t : Timer;
t.start();
coforall tid in 0..#numTasksPerLocale {
    test();
}
t.stop();
var elapsed = t.elapsed();

if performanceTest then writeln("Time per operation : ", elapsed * 1000 / n, " ms");

proc test() {
    var myc1 = new chan(int, 1);
    var myc2 = new chan(int, 1);
    myc1.send(0);
    for i in 0..#cnt {
        var x1 : int;
        var x2 : int;
        var case1 : SelectBaseClass = new shared SelectCase(x1, myc1, selectOperation.recv, 0);
        var case2 : SelectBaseClass = new shared SelectCase(x2, myc2, selectOperation.recv, 0);
        var arr = [case1, case2];
        var idx = selectProcess(arr);
        if idx == 0 then myc2.send(0);
        else myc1.send(0);
    }
}
selectSyncContended.chpl
use Channel;
use Time;
config const n = 10000;
const numTasksPerLocale = if dataParTasksPerLocale > 0 then dataParTasksPerLocale
                                                       else here.maxTaskPar;
config const performanceTest = false;
var t : Timer;
t.start();
var myc1 = new chan(int);
var myc2 = new chan(int);
var myc3 = new chan(int);
var done = new chan(int);
sync {
    for tid in 0..#numTasksPerLocale {
        begin with (ref myc1, ref myc2, ref myc3, ref done) {
            while(true) {
                var x1, x2, x3, x4 : int;
                var case1 : SelectBaseClass = new shared SelectCase(x1, myc1, selectOperation.send, 4);
                var case2 : SelectBaseClass = new shared SelectCase(x2, myc2, selectOperation.send, 5);
                var case3 : SelectBaseClass = new shared SelectCase(x3, myc3, selectOperation.send, 6);
                var case4 : SelectBaseClass = new shared SelectCase(x4, done, selectOperation.recv, 3);
                var idx = selectProcess([case1, case2, case3, case4]);
                if idx == 3 then break;
            }
        }
    }
    coforall tid in 0..#numTasksPerLocale with (ref myc1, ref myc2, ref myc3) {
        test();
    }
    done.close();
}
t.stop();
var elapsed = t.elapsed();

if performanceTest then writeln("Time per operation : ", elapsed * 1000 / n, " ms");

proc test() {
    for i in 0..#(n / numTasksPerLocale) {
        var x1, x2, x3 : int;
        var case1 : SelectBaseClass = new shared SelectCase(x1, myc1, selectOperation.recv, 0);
        var case2 : SelectBaseClass = new shared SelectCase(x2, myc2, selectOperation.recv, 1);
        var case3 : SelectBaseClass = new shared SelectCase(x3, myc3, selectOperation.recv, 2);
        var idx = selectProcess([case1, case2, case3]);
    }
}

Results

Benchmark Chapel performance (ms) Golang Performance (ms)
chanUncontended 0.0012027 0.002500
chanContended 0.0061497 0.0019900
selectUncontended 0.0090694 0.000200
selectSyncContended 0.0095526 0.004900

Observation

  • Chapel channels seem to perform better as compared to original Go-channels.
  • Select statements in chapel seem to have lower performance than Go. This might be due to the fact that memory is allocated for select cases each time the select statement is called.

Configuration Information

  • Output of chpl --version: 1.25.0 (pre-release)
  • Output of $CHPL_HOME/util/printchplenv --anonymize:
CHPL_TARGET_PLATFORM: linux64
CHPL_TARGET_COMPILER: gnu
CHPL_TARGET_ARCH: x86_64
CHPL_TARGET_CPU: native
CHPL_LOCALE_MODEL: flat
CHPL_COMM: none
CHPL_TASKS: qthreads *
CHPL_LAUNCHER: none
CHPL_TIMERS: generic
CHPL_UNWIND: none
CHPL_MEM: jemalloc
CHPL_ATOMICS: cstdlib
CHPL_GMP: none
CHPL_HWLOC: bundled
CHPL_RE2: bundled
CHPL_LLVM: none *
CHPL_AUX_FILESYS: none
  • Back-end compiler and version, e.g. gcc --version: gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0