28040, "markthitrin", "[Performance]: Optimization missed on ternary operator when used with arrays", "2025-11-12T11:12:39Z"
Summary of Problem
Description:
While experimenting with my project, I found that the compiler misses the opportunity to optimize the ternary operator (i.e., to generate a branchless, vectorized implementation) when I use it with arrays. I implemented the same function in C++ and found that it was optimized as expected. Moreover, when I used C pointers to reference the arrays, the optimization was applied as expected.
I am not sure if I did something wrong, please let me know if I missed something.
Is this issue currently blocking your progress?
No
Steps to Reproduce
Source Code:
main.chpl
use Config;
use Time;
use Random;
use Function;
use Testclass;
// Benchmark driver: times repeated calls to func(A, B, C) and reports the
// mean and standard deviation of the per-iteration time and of the
// iterations-per-second rate.
param iterationTest = 100;  // calls to func per timed sample
param stdTest = 100;        // number of timed samples

var t: stopwatch;
// NOTE(review): this seeded stream is never used below; the fillRandom calls
// take no seed, so A, B and the printed checksum presumably differ between
// runs. Confirm whether the fills were meant to go through `rng`.
var rng = new randomStream(real(32), seed=0);

var A: [0..#dim1] real(32);
var B: [0..#dim1] real(32);
var C: [0..#dim1] real(32);
fillRandom(A, -1, 1);
fillRandom(B, -1, 1);

// One slot per timed sample (indices 0..stdTest-1, matching the loops below).
var time: [0..#stdTest] real;
var totalTime: real;
var std: real;
// var foo = new testclass();
var checksum: real(32) = 0.0;

// Running benchmark
for i in 0..#stdTest {
  t.reset();
  t.start();
  for j in 0..#iterationTest {
    func(A, B, C);
    // Fold one output element into a printed value, presumably so the
    // calls to func cannot be discarded as dead code.
    checksum += C[0];
  }
  t.stop();
  time[i] = t.elapsed();
  totalTime += time[i];
}

// mean and std calculation
var meanPerTest = totalTime / stdTest;
for i in 0..#stdTest {
  var x = time[i] - meanPerTest;
  std += x * x;
}
// Sample variance (divides by stdTest - 1), then its square root.
std /= stdTest - 1;
std = sqrt(std);

writeln(checksum);
writeln("mean Time per iteration : ", meanPerTest / iterationTest);
writeln("std Time per iteration : ", std / sqrt(iterationTest));
writeln("mean Iteration per second : ", iterationTest / meanPerTest);
// Spread of the rate r = 1/tau, propagated as sigma_tau / tau^2.
writeln("std Iteration per second : ", iterationTest * iterationTest / (meanPerTest * meanPerTest) * (std / sqrt(iterationTest)));
Function.chpl
use Config;
// Element-wise select over the first dim1 elements (dim1 comes from Config):
// C[i] receives B[i] where A[i] >= 0, and 0.0 (as real(32)) otherwise.
// This direct array-indexing form is the variant whose perf output in this
// report shows a scalar loop with a conditional branch per element.
proc func(ref A: [] real(32), ref B: [] real(32), ref C: [] real(32)) : void {
for i in 0..#dim1 {
// The conditional expression ("ternary") under investigation.
C[i] = if A[i] >= 0 then B[i] else 0.0:real(32);
}
}
Config.chpl
// Number of elements func processes per call; 1 << 16 = 65536 by default.
// Declared `config param`, so it is a compile-time constant that can be
// overridden on the chpl command line.
config param dim1: int = 1 << 16;
Compile command:
chpl ./main.chpl --fast --no-ieee-float --vector-library=LIBMVEC-X86
--fast optimization flag enabled?
Yes
Execution command:
./main
Output
-9403.3
mean Time per iteration : 0.000277119
std Time per iteration : 5.5431e-06
mean Iteration per second : 3608.56
std Iteration per second : 72.1807
perf report Result
8.19 │ 7f0:┌─→vmovss %xmm1,(%rcx,%rdi,4) ▒
8.34 │ │ inc %rdi ◆
8.73 │ │ cmp $0x10000,%rdi ▒
│ │↑ je 7d0 ▒
8.14 │ 801:│ vxorps %xmm1,%xmm1,%xmm1 ▒
16.73 │ │ vucomiss (%rax,%rdi,4),%xmm1 ▒
8.67 │ │↑ ja 7f0 ▒
14.11 │ │ mov 0x68(%rsi),%r8 ▒
13.36 │ │ vmovss (%r8,%rdi,4),%xmm1 ▒
13.69 │ └──jmp 7f0
Configuration Information
- Output of
chpl --version:
chpl version 2.7.0 pre-release (6e9ce7050d)
built with LLVM version 19.1.1
available LLVM targets: xtensa, m68k, xcore, x86-64, x86, wasm64, wasm32, ve, systemz, sparcel, sparcv9, sparc, riscv64, riscv32, ppc64le, ppc64, ppc32le, ppc32, nvptx64, nvptx, msp430, mips64el, mips64, mipsel, mips, loongarch64, loongarch32, lanai, hexagon, bpfeb, bpfel, bpf, avr, thumbeb, thumb, armeb, arm, amdgcn, r600, aarch64_32, aarch64_be, aarch64, arm64_32, arm64
Copyright 2020-2025 Hewlett Packard Enterprise Development LP
Copyright 2004-2019 Cray Inc.
(See LICENSE file for more details)
- Output of
$CHPL_HOME/util/printchplenv --anonymize:
CHPL_TARGET_PLATFORM: linux64
CHPL_TARGET_COMPILER: llvm
CHPL_TARGET_ARCH: x86_64
CHPL_TARGET_CPU: native
CHPL_LOCALE_MODEL: flat
CHPL_COMM: none
CHPL_TASKS: qthreads
CHPL_LAUNCHER: none
CHPL_TIMERS: generic
CHPL_UNWIND: none
CHPL_TARGET_MEM: jemalloc
CHPL_ATOMICS: cstdlib
CHPL_GMP: none
CHPL_HWLOC: bundled
CHPL_RE2: bundled
CHPL_LLVM: system
CHPL_AUX_FILESYS: none
- Back-end compiler and version, e.g.
gcc --version or clang --version:
gcc (Ubuntu 14.2.0-4ubuntu2) 14.2.0
Copyright (C) 2024 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Ubuntu clang version 19.1.1 (1ubuntu1)
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/lib/llvm-19/bin
Additional Note
Here are the results of using C pointers to reference the arrays
Code Changes
main.chpl
// Only these two values are shown for the second run; iterationTest is 1000
// here versus 100 in the original main.chpl listing, stdTest is unchanged.
param iterationTest = 1000;
param stdTest = 100;
Function.chpl
use CTypes;
// Same element-wise select as the array-indexing version (C[i] = B[i] where
// A[i] >= 0, else 0.0 as real(32)), but every access goes through a C pointer
// obtained with c_ptrTo instead of indexing the Chapel arrays directly.
// The perf output in this report shows this variant compiled to a
// vectorized loop (vcmpleps / vmaskmovps / vmovups).
proc func(ref A: [] real(32), ref B: [] real(32), ref C: [] real(32)) : void {
// C pointers to the three arrays, taken once before the loop.
var A_ptr = c_ptrTo(A);
var B_ptr = c_ptrTo(B);
var C_ptr = c_ptrTo(C);
for i in 0..#dim1 {
// Identical conditional expression, applied through the pointers.
C_ptr[i] = if A_ptr[i] >= 0 then B_ptr[i] else 0.0:real(32);
}
}
Output
-5423.39
mean Time per iteration : 7.38943e-06
std Time per iteration : 4.29594e-07
mean Iteration per second : 1.35328e+05
std Iteration per second : 7867.48
perf report Result
8.62 │ 880:┌─→vcmpleps -0x60(%r13,%rbp,4),%ymm5,%ymm1 ◆
10.74 │ │ vcmpleps -0x40(%r13,%rbp,4),%ymm5,%ymm2 ▒
12.32 │ │ vcmpleps -0x20(%r13,%rbp,4),%ymm5,%ymm3 ▒
11.80 │ │ vcmpleps 0x0(%r13,%rbp,4),%ymm5,%ymm4 ▒
13.82 │ │ vmaskmovps -0x60(%r12,%rbp,4),%ymm1,%ymm1 ▒
8.25 │ │ vmaskmovps -0x40(%r12,%rbp,4),%ymm2,%ymm2 ▒
10.27 │ │ vmaskmovps -0x20(%r12,%rbp,4),%ymm3,%ymm3 ▒
9.09 │ │ vmaskmovps (%r12,%rbp,4),%ymm4,%ymm4 ▒
4.86 │ │ vmovups %ymm1,-0x60(%r15,%rbp,4) ▒
1.68 │ │ vmovups %ymm2,-0x40(%r15,%rbp,4) ▒
1.16 │ │ vmovups %ymm3,-0x20(%r15,%rbp,4) ▒
1.71 │ │ vmovups %ymm4,(%r15,%rbp,4) ▒
2.70 │ │ add $0x20,%rbp ▒
2.49 │ ├──cmp $0x10000,%rbp ▒
│ └──jne 880