In the previous part, you may have seen some (albeit small) performance differences between different mapping decisions. Those differences naturally become more pronounced as the problem size grows. The goal of this exercise is to explore different mapping choices and figure out which mappings perform best and why.
In this part we give you a somewhat bigger problem: 320K nodes and 1.3M wires in 128 circuit pieces. The number of time steps per iteration has also increased from 10K to 100K. With this setting, your task is to find the mapping choices that perform best.
As a starting point, we give you mapping rules that map the three simulation tasks to CPUs and their regions to system memory. For reference, the system you will use has four nodes, each with 16 CPUs and 4 GPUs. Each node has ample memory of every kind (system, RDMA, GPU framebuffer, and GPU zero-copy memory), so feel free to place your regions in whichever memories you like.
The syntax guide from the previous part will be useful, so we reproduce it below.
In [ ]:
__demand(__cuda) task T ... -- Generates both x86 and CUDA variants for task T
mapper ... end -- Starts a Bishop mapper
TE { target : V; } -- Sets value V as the target of a task that matches TE
TE RE { target : V; } -- Sets value V as the target of a region that matches RE and whose task matches TE
-- Task Element (TE)
task -- Selects any tasks
task#T -- Selects tasks named T
task[isa=I] -- Selects tasks mapped to a processor that supports ISA I
TE[target=$T] -- Selects tasks that satisfy TE and then binds their target to $T
TE[index=$P] -- Selects tasks that satisfy TE and then binds their point in the launch domain to $P
-- Region Element (RE)
region -- Selects any regions
region#P -- Selects regions named P in the signature
-- Processor objects
processors -- A list of processors in the whole system
processors[isa=I] -- A list of processors that support ISA I (either x86 or cuda)
processors[N] -- The N-th processor in the list
L.size -- The size of list L of processors
P.memories -- A list of memories visible to processor P
-- Memory objects
memories -- A list of memories in the whole system
memories[kind=K] -- A list of memories of kind K (sysmem, regmem, fbmem, or zcmem)
memories[N] -- The N-th memory in the list
L.size -- The size of list L of memories
-- Expressions for list indices
$P[0] -- The first coordinate of point $P
E1 + E2, E1 - E2, E1 * E2, E1 / E2, E1 % E2 -- Usual integer arithmetic expressions
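-- Example: combining the elements above. A sketch (not the mapping we give
-- you below) that spreads an index launch of update_voltages across the GPUs
-- round-robin and keeps its region in the target GPU's framebuffer:
task#update_voltages[index=$p] { target : processors[isa=cuda][$p[0] % processors[isa=cuda].size]; }
task#update_voltages[target=$proc] region { target : $proc.memories[kind=fbmem]; }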
In [ ]:
import "regent"
import "bishop"
local c = regentlib.c
struct Currents {
  _0 : float,
  _1 : float,
  _2 : float,
}

struct Voltages {
  _1 : float,
  _2 : float,
}

fspace Node {
  capacitance : float,
  leakage : float,
  charge : float,
  voltage : float,
}

fspace Wire(rpn : region(Node), rsn : region(Node), rgn : region(Node)) {
  in_node : ptr(Node, rpn, rsn),
  out_node : ptr(Node, rpn, rsn, rgn),
  inductance : float,
  resistance : float,
  capacitance : float,
  current : Currents,
  voltage : Voltages,
}
local CktConfig = require("session3/circuit_config")
local helper = require("session3/circuit_helper")
local WS = 3
local dT = 1e-7
mapper

task#calculate_new_currents[index=$p],
task#distribute_charge[index=$p],
task#update_voltages[index=$p]
{
  target : processors[isa=x86][$p[0] % processors[isa=x86].size];
}

task[isa=x86 and target=$proc] region
{
  target : $proc.memories[kind=sysmem];
}

end
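
-- One alternative to explore (a sketch, untested): replace the rules above
-- with GPU targets, e.g. mapping the tasks round-robin across the GPUs and
-- splitting region placement between framebuffer and zero-copy memory.
-- This assumes the more specific region#rgn rule takes precedence over the
-- generic region rule.
--
-- task#calculate_new_currents[index=$p],
-- task#distribute_charge[index=$p],
-- task#update_voltages[index=$p]
-- {
--   target : processors[isa=cuda][$p[0] % processors[isa=cuda].size];
-- }
--
-- task[isa=cuda and target=$proc] region#rgn
-- {
--   target : $proc.memories[kind=zcmem];
-- }
--
-- task[isa=cuda and target=$proc] region
-- {
--   target : $proc.memories[kind=fbmem];
-- }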
__demand(__cuda)
task calculate_new_currents(steps : uint,
                            rpn : region(Node),
                            rsn : region(Node),
                            rgn : region(Node),
                            rw : region(Wire(rpn, rsn, rgn)))
where
  reads(rpn.voltage, rsn.voltage, rgn.voltage,
        rw.{in_node, out_node, inductance, resistance, capacitance}),
  reads writes(rw.{current, voltage})
do
  var rdT : float = 1.0 / dT
  __demand(__vectorize)
  for w in rw do
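    -- Each wire is modeled as WS series segments: WS segment currents and
    -- WS + 1 voltages, with the two outermost voltages pinned to the wire's
    -- end nodes below.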
    var temp_v : float[WS + 1]
    var temp_i : float[WS]
    var old_i : float[WS]
    var old_v : float[WS - 1]
    temp_i[0] = w.current._0
    temp_i[1] = w.current._1
    temp_i[2] = w.current._2
    for i = 0, WS do old_i[i] = temp_i[i] end
    temp_v[1] = w.voltage._1
    temp_v[2] = w.voltage._2
    for i = 0, WS - 1 do old_v[i] = temp_v[i + 1] end

    -- Pin the outer voltages to the node voltages.
    temp_v[0] = w.in_node.voltage
    temp_v[WS] = w.out_node.voltage

    -- Solve the RLC model iteratively.
    var L : float = w.inductance
    var rR : float = 1.0 / w.resistance
    var rC : float = 1.0 / w.capacitance
    for j = 0, steps do
      -- First, figure out the new current from the voltage differential
      -- and our inductance:
      -- dV = R*I + L*I' ==> I = (dV - L*I')/R
      for i = 0, WS do
        temp_i[i] = ((temp_v[i + 1] - temp_v[i]) -
                     (L * (temp_i[i] - old_i[i]) * rdT)) * rR
      end
      -- Now update the inter-node voltages.
      for i = 0, WS - 1 do
        temp_v[i + 1] = old_v[i] + dT * (temp_i[i] - temp_i[i + 1]) * rC
      end
    end

    -- Write out the results.
    w.current._0 = temp_i[0]
    w.current._1 = temp_i[1]
    w.current._2 = temp_i[2]
    w.voltage._1 = temp_v[1]
    w.voltage._2 = temp_v[2]
  end
end
__demand(__cuda)
task distribute_charge(rpn : region(Node),
                       rsn : region(Node),
                       rgn : region(Node),
                       rw : region(Wire(rpn, rsn, rgn)))
where
  reads(rw.{in_node, out_node, current._0, current._2}),
  reduces +(rpn.charge, rsn.charge, rgn.charge)
do
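  -- Move one step's worth of charge off each wire's in node (at current._0)
  -- and onto its out node (at current._2). The reduction privilege on charge
  -- lets different pieces update shared and ghost nodes concurrently.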
  for w in rw do
    var in_current = -dT * w.current._0
    var out_current = dT * w.current._2
    w.in_node.charge += in_current
    w.out_node.charge += out_current
  end
end
__demand(__cuda)
task update_voltages(rn : region(Node))
where
  reads(rn.{capacitance, leakage}),
  reads writes(rn.{voltage, charge})
do
  for n in rn do
    var voltage = n.voltage + n.charge / n.capacitance
    voltage = voltage * (1.0 - n.leakage)
    n.voltage = voltage
    n.charge = 0.0
  end
end
task toplevel()
  var conf : CktConfig
  conf:initialize_from_command()
  conf:show()

  var num_circuit_nodes = conf.num_pieces * conf.nodes_per_piece
  var num_circuit_wires = conf.num_pieces * conf.wires_per_piece

  var rn = region(ispace(ptr, num_circuit_nodes), Node)
  var rw = region(ispace(ptr, num_circuit_wires), Wire(wild, wild, wild))
  new(ptr(Node, rn), num_circuit_nodes)
  new(ptr(Wire(wild, wild, wild), rw), num_circuit_wires)

  c.printf("Generating a random circuit...\n")
  helper.generate_random_circuit(rn, rw, conf)
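
  -- Partitioning scheme:
  --   pn_equal   : nodes divided equally into pieces
  --   pw         : wires grouped by the piece that owns their in node
  --   pn_extrefs : nodes referenced by wires owned by other pieces
  --   pn_private : nodes touched only by their own piece
  --   pn_shared  : nodes in a piece that other pieces also reference
  --   pn_ghost   : out-of-piece nodes referenced by a piece's wires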
  var colors = ispace(int1d, conf.num_pieces)
  var pn_equal = partition(equal, rn, colors)
  var pw = preimage(rw, pn_equal, rw.in_node)
  var pn_extrefs = image(rn, preimage(rw, pn_equal, rw.out_node) - pw, rw.out_node)
  var pn_private = pn_equal - pn_extrefs
  var pn_shared = pn_equal & pn_extrefs
  var pn_ghost = image(rn, pw, rw.out_node) - pn_equal

  __demand(__parallel)
  for i = 0, conf.num_pieces do
    helper.initialize_pointers(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
  end
  helper.wait_for(helper.block(rn, rw))

  c.printf("Starting main simulation loop\n")
  var ts_start = helper.timestamp()
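
  -- Each iteration launches the three phases over all pieces; the runtime
  -- derives the parallelism and any data movement between pieces from the
  -- region arguments, the declared privileges, and your mapping.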
  for j = 0, conf.num_loops do
    for i = 0, conf.num_pieces do
      calculate_new_currents(conf.steps, pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
    end
    for i = 0, conf.num_pieces do
      distribute_charge(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
    end
    for i = 0, conf.num_pieces do
      update_voltages(pn_equal[i])
    end
  end

  -- Wait for all previous tasks to complete and measure the elapsed time.
  var _ = 0
  for i = 0, conf.num_pieces do
    _ += helper.block(pn_equal[i], pw[i])
  end
  helper.wait_for(_)
  var ts_end = helper.timestamp()

  c.printf("simulation complete\n")
  var sim_time = 1e-6 * (ts_end - ts_start)
  c.printf("ELAPSED TIME = %7.3f s\n", sim_time)
  var gflops =
    helper.calculate_gflops(sim_time, WS * 6 + (WS - 1) * 4, 4, 4, conf)
  c.printf("GFLOPS = %7.3f GFLOPS\n", gflops)
end
bishoplib.register_bishop_mappers()
regentlib.start(toplevel)
That's it: you've finished the simulation! (For real this time.) We hope you've enjoyed the exercise.