18432, "Yudhishthira1406", "Performance comparison of channels in Chapel and Golang", "2021-09-16T12:50:28Z"
Summary of Problem
As part of GSoC 2021, I am adding Go-style channels to Chapel. A major part of this project is comparing the performance of the Go and Chapel implementations. I ran the following benchmarks in Go and Chapel.
NOTE: All Chapel programs are compiled with the `--fast` flag.
Golang Benchmarks
chanUncontended.go
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
// main runs the uncontended buffered-channel benchmark.
//
// Four goroutines each perform N/procs operations. One operation creates a
// private buffered channel of capacity C, sends C values into it and then
// receives them all back, so no channel is ever shared between goroutines.
// The average wall-clock time per operation is printed in milliseconds.
func main() {
	C := 100   // channel capacity and number of sends/receives per operation
	N := 10000 // total number of operations across all goroutines
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			for k := 0; k < N/procs; k++ {
				myc := make(chan int, C)
				for j := 0; j < C; j++ {
					myc <- 0
				}
				for j := 0; j < C; j++ {
					<-myc
				}
			}
		}()
	}
	wg.Wait()
	// Convert to float64 BEFORE dividing: time.Since(start)/time.Millisecond
	// is an integer Duration division and would truncate the measurement to
	// whole milliseconds, which is far too coarse for a run this short.
	elapsedMs := float64(time.Since(start)) / float64(time.Millisecond)
	fmt.Printf("%f\n", elapsedMs/float64(N))
}
chanContended.go
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
// main runs the contended buffered-channel benchmark.
//
// Four goroutines share a single buffered channel of capacity C*procs. Each
// goroutine performs N/procs operations, where one operation sends C values
// into the shared channel and then receives C values from it. The capacity
// is large enough for every goroutine to complete its send phase without
// blocking. The average wall-clock time per operation is printed in
// milliseconds.
func main() {
	C := 100   // sends/receives per operation
	N := 10000 // total number of operations across all goroutines
	runtime.GOMAXPROCS(4)
	procs := 4
	var wg sync.WaitGroup
	wg.Add(procs)
	myc := make(chan int, C*procs)
	start := time.Now()
	for p := 0; p < procs; p++ {
		go func() {
			// Deferred like in the sibling benchmarks so the WaitGroup is
			// released on every exit path of the goroutine.
			defer wg.Done()
			for i := 0; i < N/procs; i++ {
				for j := 0; j < C; j++ {
					myc <- 0
				}
				for j := 0; j < C; j++ {
					x1 := <-myc
					x1++ // use the received value so the variable is not unused
				}
			}
		}()
	}
	wg.Wait()
	// Convert to float64 BEFORE dividing: time.Since(start)/time.Millisecond
	// is an integer Duration division and would truncate the measurement to
	// whole milliseconds.
	elapsedMs := float64(time.Since(start)) / float64(time.Millisecond)
	fmt.Printf("%f\n", elapsedMs/float64(N))
}
selectUncontended.go
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
// main runs the uncontended select benchmark.
//
// Four goroutines each own two private channels of capacity 1. A single
// token is placed in the first channel; every iteration selects on a receive
// from either channel and forwards the token to the other one, so exactly
// one case is ready per select. Each goroutine performs N/procs selects.
// The average wall-clock time per select is printed in milliseconds.
func main() {
	N := 10000 // total number of select operations across all goroutines
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			myc1 := make(chan int, 1)
			myc2 := make(chan int, 1)
			myc1 <- 0 // seed the token
			for j := 0; j < N/procs; j++ {
				select {
				case <-myc1:
					myc2 <- 0
				case <-myc2:
					myc1 <- 0
				}
			}
		}()
	}
	wg.Wait()
	// Convert to float64 BEFORE dividing: time.Since(start)/time.Millisecond
	// is an integer Duration division and would truncate the measurement to
	// whole milliseconds.
	elapsedMs := float64(time.Since(start)) / float64(time.Millisecond)
	fmt.Printf("%f\n", elapsedMs/float64(N))
}
selectSyncContended.go
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
// main runs the contended select benchmark on unbuffered channels.
//
// Three unbuffered channels are shared by all goroutines. Each of the four
// worker goroutines starts a producer goroutine that loops on a select over
// sends to the three channels (exiting once `done` is closed), and then
// performs N/procs selects over receives from the same three channels.
// After all workers finish, `done` is closed to stop the producers; the
// producers themselves are not waited for. The average wall-clock time per
// receiving select is printed in milliseconds.
func main() {
	N := 10000 // total number of receiving selects across all workers
	runtime.GOMAXPROCS(4)
	procs := 4
	start := time.Now()
	myc1 := make(chan int)
	myc2 := make(chan int)
	myc3 := make(chan int)
	done := make(chan int)
	var wg sync.WaitGroup
	wg.Add(procs)
	for i := 0; i < procs; i++ {
		go func() {
			defer wg.Done()
			// Producer: keeps offering a value on whichever channel has a
			// waiting receiver until done is closed.
			go func() {
				for {
					select {
					case myc1 <- 0:
					case myc2 <- 0:
					case myc3 <- 0:
					case <-done:
						return
					}
				}
			}()
			// Consumer: the measured select operations.
			for j := 0; j < N/procs; j++ {
				select {
				case <-myc1:
				case <-myc2:
				case <-myc3:
				}
			}
		}()
	}
	wg.Wait()
	close(done)
	// Convert to float64 BEFORE dividing: time.Since(start)/time.Millisecond
	// is an integer Duration division and would truncate the measurement to
	// whole milliseconds.
	elapsedMs := float64(time.Since(start)) / float64(time.Millisecond)
	fmt.Printf("%f\n", elapsedMs/float64(N))
}
Chapel Benchmarks
chanUncontended.chpl
use Channel;
use Time;
// Benchmark settings; the config consts can be overridden on the command line.
config const n = 10000;                 // number of test() invocations
config const C = 100;                   // channel capacity / sends per test()
config const performanceTest = false;   // print the timing only when true

// Task count: dataParTasksPerLocale when positive, else here.maxTaskPar.
const numTasksPerLocale =
  if dataParTasksPerLocale > 0 then dataParTasksPerLocale else here.maxTaskPar;

var benchTimer : Timer;

// Timed region: n data-parallel invocations of test().
benchTimer.start();
forall i in 0..#n do
  test();
benchTimer.stop();

var elapsedTime = benchTimer.elapsed();
if performanceTest {
  // elapsedTime is scaled by 1000 and reported as milliseconds per operation.
  writeln("Time per operation : ", elapsedTime * 1000 / n, " ms");
}
/* One benchmark operation: create a private channel of capacity C,
   send C zeros into it, then receive all C values back. */
proc test() {
  var privateChan = new chan(int, C);

  for sendIdx in 0..#C do
    privateChan.send(0);

  for recvIdx in 0..#C {
    var received : int;
    privateChan.recv(received);
  }
}
chanContended.chpl
use Channel;
use Time;
// Benchmark settings; the config consts can be overridden on the command line.
config const n = 10000;                 // number of test() invocations
config const C = 100;                   // sends/receives per test()
config const performanceTest = false;   // print the timing only when true

// Task count: dataParTasksPerLocale when positive, else here.maxTaskPar.
const numTasksPerLocale =
  if dataParTasksPerLocale > 0 then dataParTasksPerLocale else here.maxTaskPar;

var benchTimer : Timer;

// Single channel shared by every test() invocation; it is created before the
// timer starts and holds C slots per task.
var myc = new chan(int, C * numTasksPerLocale);

// Timed region: n data-parallel invocations of test().
benchTimer.start();
forall i in 0..#n do
  test();
benchTimer.stop();

var elapsedTime = benchTimer.elapsed();
if performanceTest {
  // elapsedTime is scaled by 1000 and reported as milliseconds per operation.
  writeln("Time per operation : ", elapsedTime * 1000 / n, " ms");
}
/* One benchmark operation on the shared module-level channel `myc`:
   send C zeros, then receive C values. */
proc test() {
  for sendIdx in 0..#C do
    myc.send(0);

  for recvIdx in 0..#C {
    var received : int;
    myc.recv(received);
  }
}
selectUncontended.chpl
use Channel;
use Time;
// Benchmark settings; the config consts can be overridden on the command line.
config const n = 10000;                 // total number of select operations
config const performanceTest = false;   // print the timing only when true

// Task count: dataParTasksPerLocale when positive, else here.maxTaskPar.
const numTasksPerLocale =
  if dataParTasksPerLocale > 0 then dataParTasksPerLocale else here.maxTaskPar;

// Number of select operations each task performs inside test().
var cnt = n / numTasksPerLocale;

var benchTimer : Timer;

// Timed region: one test() call per task.
benchTimer.start();
coforall tid in 0..#numTasksPerLocale do
  test();
benchTimer.stop();

var elapsedTime = benchTimer.elapsed();
if performanceTest {
  // elapsedTime is scaled by 1000 and reported as milliseconds per operation.
  writeln("Time per operation : ", elapsedTime * 1000 / n, " ms");
}
/* Per-task body of the uncontended select benchmark.

   Two private channels of capacity 1 are created and one token is placed in
   myc1. Each of the `cnt` iterations builds a two-case select (receive from
   myc1, receive from myc2), runs it, and forwards the token to the channel
   that was NOT just read. */
proc test() {
  var myc1 = new chan(int, 1);
  var myc2 = new chan(int, 1);
  myc1.send(0);  // seed the token
  for i in 0..#cnt {
    var x1 : int;
    var x2 : int;
    // The two cases get distinct ids (0 and 1). They were both 0, which made
    // the `idx == 0` test below unable to tell the cases apart if
    // selectProcess reports the id of the chosen case. The ids also equal
    // the cases' positions in `arr`, so the branch is correct either way.
    // NOTE(review): presumably selectProcess returns the case id - confirm
    // against the Channel module.
    var case1 : SelectBaseClass = new shared SelectCase(x1, myc1, selectOperation.recv, 0);
    var case2 : SelectBaseClass = new shared SelectCase(x2, myc2, selectOperation.recv, 1);
    var arr = [case1, case2];
    var idx = selectProcess(arr);
    // Forward the token to the other channel.
    if idx == 0 then myc2.send(0);
    else myc1.send(0);
  }
}
selectSyncContended.chpl
use Channel;
use Time;
// Contended select benchmark driver on unbuffered channels.
// n is the total number of receiving selects; the timing is printed only
// when performanceTest is true. Both can be overridden on the command line.
config const n = 10000;
// Task count: dataParTasksPerLocale when positive, else here.maxTaskPar.
const numTasksPerLocale = if dataParTasksPerLocale > 0 then dataParTasksPerLocale
else here.maxTaskPar;
config const performanceTest = false;
var t : Timer;
// The timed region includes channel creation, as well as everything in the
// sync block below.
t.start();
// Channels created without a capacity argument; myc1..myc3 carry the data,
// done is used to stop the producer tasks.
var myc1 = new chan(int);
var myc2 = new chan(int);
var myc3 = new chan(int);
var done = new chan(int);
// The sync statement also covers the producer tasks started with `begin`,
// so t.stop() is reached only after they have finished.
sync {
// Producers: one unstructured task per numTasksPerLocale.
for tid in 0..#numTasksPerLocale {
begin with (ref myc1, ref myc2, ref myc3, ref done) {
while(true) {
var x1, x2, x3, x4 : int;
// Select over a send on each data channel (ids 4, 5, 6) and a receive on
// done (id 3).
var case1 : SelectBaseClass = new shared SelectCase(x1, myc1, selectOperation.send, 4);
var case2 : SelectBaseClass = new shared SelectCase(x2, myc2, selectOperation.send, 5);
var case3 : SelectBaseClass = new shared SelectCase(x3, myc3, selectOperation.send, 6);
var case4 : SelectBaseClass = new shared SelectCase(x4, done, selectOperation.recv, 3);
var idx = selectProcess([case1, case2, case3, case4]);
// 3 is both the id given to the done case and its position in the array.
if idx == 3 then break;
}
}
}
// Consumers: one task per numTasksPerLocale, each running test().
coforall tid in 0..#numTasksPerLocale with (ref myc1, ref myc2, ref myc3) {
test();
}
// NOTE(review): presumably closing done makes the producers' receive case
// selectable so that they break out of their loops - confirm against the
// Channel module's close semantics.
done.close();
}
t.stop();
var elapsed = t.elapsed();
// elapsed is scaled by 1000 and reported as milliseconds per operation.
if performanceTest then writeln("Time per operation : ", elapsed * 1000 / n, " ms");
/* Consumer body of the contended select benchmark: performs
   n / numTasksPerLocale selects, each over a receive from the module-level
   channels myc1, myc2 and myc3 (case ids 0, 1 and 2). */
proc test() {
  const iterations = n / numTasksPerLocale;
  for iter in 0..#iterations {
    var v1, v2, v3 : int;
    var recvFrom1 : SelectBaseClass = new shared SelectCase(v1, myc1, selectOperation.recv, 0);
    var recvFrom2 : SelectBaseClass = new shared SelectCase(v2, myc2, selectOperation.recv, 1);
    var recvFrom3 : SelectBaseClass = new shared SelectCase(v3, myc3, selectOperation.recv, 2);
    // The result is not used; only the cost of the select itself matters.
    var chosen = selectProcess([recvFrom1, recvFrom2, recvFrom3]);
  }
}
Results
| Benchmark | Chapel performance (ms) | Golang performance (ms) |
| --- | --- | --- |
| chanUncontended | 0.0012027 | 0.002500 |
| chanContended | 0.0061497 | 0.0019900 |
| selectUncontended | 0.0090694 | 0.000200 |
| selectSyncContended | 0.0095526 | 0.004900 |
Observation
- Chapel channels seem to perform better than Go channels in the uncontended case (chanUncontended), but are slower than Go under contention (chanContended).
- Select statements in Chapel seem to be slower than in Go. This might be because memory is allocated for the select cases every time a select statement is executed.
Configuration Information
- Output of `chpl --version`: 1.25.0 (pre-release)
- Output of `$CHPL_HOME/util/printchplenv --anonymize`:
CHPL_TARGET_PLATFORM: linux64
CHPL_TARGET_COMPILER: gnu
CHPL_TARGET_ARCH: x86_64
CHPL_TARGET_CPU: native
CHPL_LOCALE_MODEL: flat
CHPL_COMM: none
CHPL_TASKS: qthreads *
CHPL_LAUNCHER: none
CHPL_TIMERS: generic
CHPL_UNWIND: none
CHPL_MEM: jemalloc
CHPL_ATOMICS: cstdlib
CHPL_GMP: none
CHPL_HWLOC: bundled
CHPL_RE2: bundled
CHPL_LLVM: none *
CHPL_AUX_FILESYS: none
- Back-end compiler and version, e.g. `gcc --version`: gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0