Follow up: I came up with a workaround by using _remoteVarWrapper to make a GPU resource manager:
/*
  Handle to a value of type `eltType` that is held in a `_remoteVarWrapper`
  and tagged with the locale (`device`) it is currently associated with.

  The value is only relocated by `to` (which `access` calls implicitly when
  used from a locale other than `device`).

  NOTE(review): relies on the internal `_remoteVarWrapper` type and
  `chpl__buildRemoteWrapper` helper; where the wrapped value physically lives
  is determined by that helper, not by this record.
*/
record remote {
  type eltType;

  // Locale the wrapped value is currently associated with.
  var device: locale;

  // Wrapper that owns the value; rebuilt whenever the value is moved.
  var item: _remoteVarWrapper(eltType);

  // Locale on which this record was initialized. `to` always performs the
  // actual move from this locale.
  var _parentDevice: locale; // is this necessary?

  /*
    Wrap `item` for `device`.

    item   - value to wrap; its type becomes `eltType`
    device - locale passed to `chpl__buildRemoteWrapper` for the wrapper
  */
  proc init(item: ?eltType, device: locale) {
    this.eltType = eltType;
    this.device = device;
    this.item = chpl__buildRemoteWrapper(device, eltType, item);
    this._parentDevice = here;
  }

  /* Wrap `item` for the current locale. */
  proc init(item: ?eltType) { this.init(item, here); }

  /*
    Create a handle for `eltType` on the current locale without supplying a
    value. `item` is not assigned here, so it receives whatever default
    initialization the compiler provides for `_remoteVarWrapper(eltType)`.
  */
  proc init(type eltType) {
    this.eltType = eltType;
    this.device = here;
    this._parentDevice = here;
  }

  /*
    Return a reference to the wrapped value.

    When called from a locale other than `device`, the value is first moved
    to the calling locale via `to(here)`.
  */
  proc ref access() ref {
    // Stricter alternative: fail instead of migrating implicitly.
    // if here != this.device { try! throw new Error("Trying to access memory on wrong device!"); }
    if here != this.device {
      // `to` overwrites `this.device`, so remember the origin first;
      // otherwise the debug line would report "here -> here".
      const source = this.device;
      this.to(here);
      if debug then writeln("moved " + source.name + " -> " + here.name);
    }
    return this.item.get();
  }

  /*
    Move the wrapped value to `device`. Does nothing when the value is
    already associated with `device`.
  */
  proc ref to(device: locale) {
    if this.device == device then return;
    if here != this._parentDevice { // this may not be the best path for the data flow
      // Hop to the locale that created the record and perform the move there.
      on this._parentDevice {
        this.to(device);
      }
    } else {
      this.device = device;
      // Build a fresh wrapper for `device` from the current value and
      // replace the old wrapper with it.
      this.item = chpl__buildRemoteWrapper(device, eltType, this.item.get());
    }
  }
}
Then you can treat references to data on different devices as record values, and be explicit about which device they are on.
// Wrap a small tensor in a `remote` handle and print it from this locale.
var hostTensor: tensor(1) = [i in {0..<10}] i:real;
var handle = new remote(hostTensor);
writeln(handle.access());

// Move the value to `device` explicitly, update it there, then move it back.
handle.to(device);
on device do handle.access().data += 1.0;
handle.to(here);
writeln(handle.access());
(_domain = {0..9}, data = 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0)
0 (gpu 0): gputil.chpl:2079: copy from host to device, 80 bytes, commid 275
0 (gpu 0): $CHPL_HOME/modules/internal/ChapelArray.chpl:2699: kernel launch (block size: 512x1x1)
0 (gpu 0): gputil.chpl:2079: copy from device to host, 80 bytes, commid 273
(_domain = {0..9}, data = 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0)
As shown, the data transfers only occur when .to is called. Another (cleaner) example using plain arrays:
/*
  Overwrite the value wrapped by `rx` with `sin` of itself, executing on
  whichever locale `rx.device` names.

  rx - handle whose wrapped value is updated in place
*/
proc remote_sin(ref rx: remote(?t)) {
  use Math;
  on rx.device {
    // Take a reference so the assignment writes into the wrapped value.
    ref values = rx.access();
    values = sin(values);
  }
}
// Same round trip as above, using a plain array instead of a tensor.
const samples = [i in {0..<10}] i:real;
var handle = new remote(samples);
writeln(handle.access());

// Explicit transfers bracket the computation that runs on `device`.
handle.to(device);
remote_sin(handle);
handle.to(here);
writeln(handle.access());
This seems to work fine, and you can save values of remote(...) as class/record fields without copying the underlying data. But I am unsure whether this approach has any issues.
I think that by adding an init= proc, one could have a system for programmatically transferring data between GPU and program memory, without having to write the transfers out syntactically using on statements.