In the previous part, you may have seen some (albeit small) performance differences between different mapping decisions. Those differences naturally become more pronounced as the problem size grows. The goal of this exercise is to explore different mapping choices and figure out which mappings perform best and why.
In this part we give you a somewhat bigger problem: 320K nodes and 1.3M wires in 128 circuit pieces. The number of time steps per iteration has also increased from 10K to 100K. With this setting, your task is to find the mapping choices that perform best.
As a starting point, we give you mapping rules that map the three simulation tasks to CPUs and their regions to system memory. For reference, the system you will use has four nodes, each with 16 CPUs and 4 GPUs. Each node has ample memory of every kind (system, RDMA, GPU framebuffer, and GPU zero-copy memory), so feel free to place your regions in whichever memories you like.
The syntax guide from the previous part will be useful, so we reproduce it below.
In [ ]:
__demand(__cuda) task T ... -- Generates both x86 and CUDA variants for task T
mapper ... end -- Starts a Bishop mapper
TE { target : V; } -- Sets value V as the target of a task that matches TE
TE RE { target : V; } -- Sets value V as the target of a region that matches RE and whose task matches TE
-- Task Element (TE)
task -- Selects any tasks
task#T -- Selects tasks named T
task[isa=I] -- Selects tasks mapped to a processor that supports ISA I
TE[target=$T] -- Selects tasks that satisfy TE and then binds their target to $T
TE[index=$P] -- Selects tasks that satisfy TE and then binds their point in the launch domain to $P
-- Region Element (RE)
region -- Selects any regions
region#P -- Selects regions named P in the signature
-- Processor objects
processors -- A list of processors in the whole system
processors[isa=I] -- A list of processors that support ISA I (either x86 or cuda)
processors[N] -- The N-th processor in the list
L.size -- The size of list L of processors
P.memories -- A list of memories visible to processor P
-- Memory objects
memories -- A list of memories in the whole system
memories[kind=K] -- A list of memories of kind K (sysmem, regmem, fbmem, or zcmem)
memories[N] -- The N-th memory in the list
L.size -- The size of list L of memories
-- Expressions for list indices
$P[0] -- The first coordinate of point $P
E1 + E2, E1 - E2, E1 * E2, E1 / E2, E1 % E2 -- Usual integer arithmetic expressions
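-- Example: combining the elements above. A sketch (not the mapping we give
-- you below) that spreads an index launch of update_voltages across the GPUs
-- round-robin and keeps its region in the target GPU's framebuffer:
task#update_voltages[index=$p] { target : processors[isa=cuda][$p[0] % processors[isa=cuda].size]; }
task#update_voltages[target=$proc] region { target : $proc.memories[kind=fbmem]; }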
In [ ]:
import "regent"
import "bishop"
local c = regentlib.c
struct Currents {
  _0 : float,
  _1 : float,
  _2 : float,
}

struct Voltages {
  _1 : float,
  _2 : float,
}

fspace Node {
  capacitance : float,
  leakage : float,
  charge : float,
  voltage : float,
}

fspace Wire(rpn : region(Node), rsn : region(Node), rgn : region(Node)) {
  in_node : ptr(Node, rpn, rsn),
  out_node : ptr(Node, rpn, rsn, rgn),
  inductance : float,
  resistance : float,
  capacitance : float,
  current : Currents,
  voltage : Voltages,
}
local CktConfig = require("session3/circuit_config")
local helper = require("session3/circuit_helper")
local WS = 3
local dT = 1e-7
mapper

task#calculate_new_currents[index=$p],
task#distribute_charge[index=$p],
task#update_voltages[index=$p]
{
  target : processors[isa=x86][$p[0] % processors[isa=x86].size];
}

task[isa=x86 and target=$proc] region
{
  target : $proc.memories[kind=sysmem];
}

end
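
-- One alternative to explore (a sketch, untested): replace the rules above
-- with GPU targets, e.g. mapping the tasks round-robin across the GPUs and
-- splitting region placement between framebuffer and zero-copy memory.
-- This assumes the more specific region#rgn rule takes precedence over the
-- generic region rule.
--
-- task#calculate_new_currents[index=$p],
-- task#distribute_charge[index=$p],
-- task#update_voltages[index=$p]
-- {
--   target : processors[isa=cuda][$p[0] % processors[isa=cuda].size];
-- }
--
-- task[isa=cuda and target=$proc] region#rgn
-- {
--   target : $proc.memories[kind=zcmem];
-- }
--
-- task[isa=cuda and target=$proc] region
-- {
--   target : $proc.memories[kind=fbmem];
-- }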
__demand(__cuda)
task calculate_new_currents(steps : uint,
                            rpn : region(Node),
                            rsn : region(Node),
                            rgn : region(Node),
                            rw : region(Wire(rpn, rsn, rgn)))
where
  reads(rpn.voltage, rsn.voltage, rgn.voltage,
        rw.{in_node, out_node, inductance, resistance, capacitance}),
  reads writes(rw.{current, voltage})
do
  var rdT : float = 1.0 / dT
  __demand(__vectorize)
  for w in rw do
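    -- Each wire is modeled as WS series segments: WS segment currents and
    -- WS + 1 voltages, with the two outermost voltages pinned to the wire's
    -- end nodes below.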
    var temp_v : float[WS + 1]
    var temp_i : float[WS]
    var old_i : float[WS]
    var old_v : float[WS - 1]
    temp_i[0] = w.current._0
    temp_i[1] = w.current._1
    temp_i[2] = w.current._2
    for i = 0, WS do old_i[i] = temp_i[i] end
    temp_v[1] = w.voltage._1
    temp_v[2] = w.voltage._2
    for i = 0, WS - 1 do old_v[i] = temp_v[i + 1] end

    -- Pin the outer voltages to the node voltages.
    temp_v[0] = w.in_node.voltage
    temp_v[WS] = w.out_node.voltage

    -- Solve the RLC model iteratively.
    var L : float = w.inductance
    var rR : float = 1.0 / w.resistance
    var rC : float = 1.0 / w.capacitance
    for j = 0, steps do
      -- First, figure out the new current from the voltage differential
      -- and our inductance:
      -- dV = R*I + L*I' ==> I = (dV - L*I')/R
      for i = 0, WS do
        temp_i[i] = ((temp_v[i + 1] - temp_v[i]) -
                     (L * (temp_i[i] - old_i[i]) * rdT)) * rR
      end
      -- Now update the inter-node voltages.
      for i = 0, WS - 1 do
        temp_v[i + 1] = old_v[i] + dT * (temp_i[i] - temp_i[i + 1]) * rC
      end
    end

    -- Write out the results.
    w.current._0 = temp_i[0]
    w.current._1 = temp_i[1]
    w.current._2 = temp_i[2]
    w.voltage._1 = temp_v[1]
    w.voltage._2 = temp_v[2]
  end
end
__demand(__cuda)
task distribute_charge(rpn : region(Node),
                       rsn : region(Node),
                       rgn : region(Node),
                       rw : region(Wire(rpn, rsn, rgn)))
where
  reads(rw.{in_node, out_node, current._0, current._2}),
  reduces +(rpn.charge, rsn.charge, rgn.charge)
do
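  -- Move one step's worth of charge off each wire's in node (at current._0)
  -- and onto its out node (at current._2). The reduction privilege on charge
  -- lets different pieces update shared and ghost nodes concurrently.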
  for w in rw do
    var in_current = -dT * w.current._0
    var out_current = dT * w.current._2
    w.in_node.charge += in_current
    w.out_node.charge += out_current
  end
end
__demand(__cuda)
task update_voltages(rn : region(Node))
where
  reads(rn.{capacitance, leakage}),
  reads writes(rn.{voltage, charge})
do
  for n in rn do
    var voltage = n.voltage + n.charge / n.capacitance
    voltage = voltage * (1.0 - n.leakage)
    n.voltage = voltage
    n.charge = 0.0
  end
end
task toplevel()
  var conf : CktConfig
  conf:initialize_from_command()
  conf:show()

  var num_circuit_nodes = conf.num_pieces * conf.nodes_per_piece
  var num_circuit_wires = conf.num_pieces * conf.wires_per_piece

  var rn = region(ispace(ptr, num_circuit_nodes), Node)
  var rw = region(ispace(ptr, num_circuit_wires), Wire(wild, wild, wild))
  new(ptr(Node, rn), num_circuit_nodes)
  new(ptr(Wire(wild, wild, wild), rw), num_circuit_wires)

  c.printf("Generating a random circuit...\n")
  helper.generate_random_circuit(rn, rw, conf)
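
  -- Partitioning scheme:
  --   pn_equal   : nodes divided equally into pieces
  --   pw         : wires grouped by the piece that owns their in node
  --   pn_extrefs : nodes referenced by wires owned by other pieces
  --   pn_private : nodes touched only by their own piece
  --   pn_shared  : nodes in a piece that other pieces also reference
  --   pn_ghost   : out-of-piece nodes referenced by a piece's wires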
  var colors = ispace(int1d, conf.num_pieces)
  var pn_equal = partition(equal, rn, colors)
  var pw = preimage(rw, pn_equal, rw.in_node)
  var pn_extrefs = image(rn, preimage(rw, pn_equal, rw.out_node) - pw, rw.out_node)
  var pn_private = pn_equal - pn_extrefs
  var pn_shared = pn_equal & pn_extrefs
  var pn_ghost = image(rn, pw, rw.out_node) - pn_equal

  __demand(__parallel)
  for i = 0, conf.num_pieces do
    helper.initialize_pointers(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
  end
  helper.wait_for(helper.block(rn, rw))

  c.printf("Starting main simulation loop\n")
  var ts_start = helper.timestamp()
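
  -- Each iteration launches the three phases over all pieces; the runtime
  -- derives the parallelism and any data movement between pieces from the
  -- region arguments, the declared privileges, and your mapping.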
  for j = 0, conf.num_loops do
    for i = 0, conf.num_pieces do
      calculate_new_currents(conf.steps, pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
    end
    for i = 0, conf.num_pieces do
      distribute_charge(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
    end
    for i = 0, conf.num_pieces do
      update_voltages(pn_equal[i])
    end
  end

  -- Wait for all previous tasks to complete and measure the elapsed time.
  var _ = 0
  for i = 0, conf.num_pieces do
    _ += helper.block(pn_equal[i], pw[i])
  end
  helper.wait_for(_)
  var ts_end = helper.timestamp()

  c.printf("simulation complete\n")
  var sim_time = 1e-6 * (ts_end - ts_start)
  c.printf("ELAPSED TIME = %7.3f s\n", sim_time)
  var gflops =
    helper.calculate_gflops(sim_time, WS * 6 + (WS - 1) * 4, 4, 4, conf)
  c.printf("GFLOPS = %7.3f GFLOPS\n", gflops)
end
bishoplib.register_bishop_mappers()
regentlib.start(toplevel)
That's it: you've finished the simulation! (For real this time.) We hope you've enjoyed the exercise.