In [160]:
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import torch
import time
# hack to allow relative imports on Linux within 'celltypes' project
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
sys.path.append(module_path)
# torch defaults
torch.set_default_tensor_type('torch.DoubleTensor')
from singlecell.singlecell_simsetup import singlecell_simsetup
from singlecell.singlecell_simulate import singlecell_sim
In [161]:
print sys.version
print time.time()
In [162]:
# check cuda is working...
print "torch.cuda.current_device() - ", torch.cuda.current_device()
print "torch.cuda.device(0) - ", torch.cuda.device(0)
print "torch.cuda.device_count() - ", torch.cuda.device_count()
print "torch.cuda.get_device_name(0) - ", torch.cuda.get_device_name(0)
print "torch.cuda.is_available() - ", torch.cuda.is_available()
print "torch.cuda.memory_allocated() - ", torch.cuda.memory_allocated() * 1e-6, 'MB'
print "torch.cuda.memory_cached() - ", torch.cuda.memory_cached() * 1e-6, 'MB'
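In [ ]:
# Sketch (not from the original run): make the GPU assumption explicit with a fallback message,
# since the rest of this notebook calls .cuda() unconditionally.
if torch.cuda.is_available():
    print 'CUDA device found:', torch.cuda.get_device_name(0)
else:
    print 'No CUDA device found; the GPU cells below will not run'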
In [163]:
simsetup = singlecell_simsetup()
In [164]:
# TODO maybe add the gpu variants to the vanilla simsetup script, AND add a gpu flag to simsetup?
def copy_simsetup_arrays_to_gpu(simsetup, verbose=True):
if verbose:
print 'Init tensor mem on GPU (MB):', torch.cuda.memory_allocated() * 1e-6
simsetup['gpu_J'] = torch.from_numpy(simsetup['J']).cuda()
simsetup['gpu_XI'] = torch.from_numpy(simsetup['XI']).cuda()
simsetup['gpu_AINV'] = torch.from_numpy(simsetup['A_INV']).cuda()
if verbose:
print 'Final tensor mem on GPU (MB):', torch.cuda.memory_allocated() * 1e-6
return simsetup
print simsetup['J'].shape
simsetup = copy_simsetup_arrays_to_gpu(simsetup)
print simsetup['J'].shape # cpu numpy J and torch gpu J both in simsetup now
print type(simsetup['gpu_J']), simsetup['gpu_J'].shape, simsetup['gpu_J'].is_cuda
print type(simsetup['gpu_J'].cpu()), simsetup['gpu_J'].cpu().shape, simsetup['gpu_J'].cpu().is_cuda
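In [ ]:
# Sketch (assumption, following the TODO above): fold the GPU copy into setup behind a flag. The 'gpu'
# argument and the wrapper name are hypothetical; singlecell_simsetup itself takes no such flag.
def gpu_singlecell_simsetup(gpu=True, verbose=True):
    simsetup_local = singlecell_simsetup()
    if gpu and torch.cuda.is_available():
        simsetup_local = copy_simsetup_arrays_to_gpu(simsetup_local, verbose=verbose)
    return simsetup_local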
In [165]:
t0 = time.time()
singlecell_sim(init_id='Macrophage (A)', field_protocol=None, plot_period=10,
iterations=50, simsetup=simsetup, flag_write=True, beta=2.2)
deltat = time.time() - t0
print "CPU timer:", deltat
In [170]:
from singlecell.singlecell_class import Cell
from singlecell.singlecell_constants import NUM_FULL_STEPS, BURST_ERROR_PERIOD, APP_FIELD_STRENGTH, EXT_FIELD_STRENGTH, \
BETA, ASYNC_BATCH
from singlecell.singlecell_data_io import run_subdir_setup, runinfo_append
from singlecell.singlecell_fields import field_setup
from singlecell.singlecell_simsetup import singlecell_simsetup, unpack_simsetup
from random import shuffle
# NOTE: this is the only piece that changed vs the CPU version -- benchmark against a deep 'if gpu else cpu' branch just for this?
def gpu_internal_field(state, gene_idx, t, intxn_matrix):
"""
Original slow summation:
h_1 = 0
intxn_list = range(0, gene_idx) + range(gene_idx+1, N)
for j in intxn_list:
h_1 += J[gene_idx,j] * state[j,t] # plus some other field terms... do we care for these?
"""
# move state to gpu
gpu_state_vec_at_t = torch.from_numpy(state[:,t]).cuda()
# compute
gpu_internal_field = torch.dot(intxn_matrix[gene_idx,:], gpu_state_vec_at_t) # note diagonals assumed to be zero (enforce in J definition)
# send scalar back
internal_field = gpu_internal_field.item()
#internal_field = gpu_internal_field.cpu()
#print type(internal_field), internal_field
return internal_field
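# Optional sanity check (a sketch, not in the original notebook): for a random spin state, the GPU
# dot product above should agree with the original slow CPU sum, since J has a zero diagonal.
def check_gpu_internal_field(simsetup_local, gene_idx=0):
    N_check = simsetup_local['J'].shape[0]
    state_check = np.random.choice([-1.0, 1.0], size=(N_check, 1))
    h_gpu = gpu_internal_field(state_check, gene_idx, 0, simsetup_local['gpu_J'])
    h_cpu = sum(simsetup_local['J'][gene_idx, j] * state_check[j, 0] for j in range(N_check) if j != gene_idx)
    print 'internal field check (gpu vs cpu):', h_gpu, h_cpu
#check_gpu_internal_field(simsetup)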
def gpu_glauber_dynamics_update(state, gene_idx, t, intxn_matrix, unirand, beta=BETA, ext_field=None, app_field=None,
ext_field_strength=EXT_FIELD_STRENGTH, app_field_strength=APP_FIELD_STRENGTH):
"""
unirand: pass a uniform 0,1 random number
        - note: previously unirand = random() or unirand = np.random_intel.random() from the Intel Python distribution
    See pages 107-111 of Amit for a discussion of the functional form
    ext_field - N x 1 - field external to the cell in a signalling sense; exosome field in multicell sim
    ext_field_strength - scaling factor for ext_field
    app_field - N x 1 - unnatural external field (e.g. force TF on for some time period experimentally)
    app_field_strength - scaling factor for app_field
"""
assert intxn_matrix.is_cuda
total_field = gpu_internal_field(state, gene_idx, t, intxn_matrix)
if ext_field is not None:
total_field += ext_field_strength * ext_field[gene_idx]
if app_field is not None:
total_field += app_field_strength * app_field[gene_idx]
prob_on_after_timestep = 1 / (1 + np.exp(-2*beta*total_field)) # probability that site i will be "up" after the timestep
#print "PRE state[gene_idx, t]", t, state[gene_idx, t], unirand, prob_on_after_timestep
if prob_on_after_timestep > unirand:
state[gene_idx, t] = 1.0
else:
state[gene_idx, t] = -1.0
#print "POST state[gene_idx, t]", t, state[gene_idx, t], unirand, total_field
return state
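# Illustration (sketch, not in the original notebook): the flip probability 1/(1 + exp(-2*beta*h))
# used above is 0.5 at zero field and saturates for large |h|; a few example values at beta=2.2.
for example_field in [-2.0, -0.5, 0.0, 0.5, 2.0]:
    print 'h =', example_field, '-> prob_on =', 1.0 / (1.0 + np.exp(-2 * 2.2 * example_field))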
# TODO make class method in singlecell
def gpu_update_state(singlecell, intxn_matrix, ext_field=None, ext_field_strength=EXT_FIELD_STRENGTH,
beta=BETA, app_field=None, app_field_strength=APP_FIELD_STRENGTH, async_batch=ASYNC_BATCH):
"""
    async_batch: if True, each site is updated exactly once per sweep in a shuffled order (sampling
                 without replacement); if False, N sites are drawn with replacement ('fully random'),
                 so the same site can be updated twice in a row -- the two schemes differ in
                 short-term behaviour but should reach the same steady state
    ext_field - N x 1 - field external to the cell in a signalling sense; exosome field in multicell sim
    ext_field_strength - scaling factor for ext_field
    app_field - N x 1 - unnatural external field (e.g. force TF on for some time period experimentally)
    app_field_strength - scaling factor for app_field
"""
assert intxn_matrix.is_cuda
sites = range(singlecell.N)
    rsamples = np.random.rand(singlecell.N)  # optimized: pass one sample to each of the N single-spin update calls; TODO: benchmark vs Intel's RNG
if async_batch:
        shuffle(sites)  # randomize the site ordering for each timestep's updates
else:
#sites = np.random.choice(self.N, self.N, replace=True)
#sites = [int(self.N*np.random.random()) for _ in xrange(self.N)] # this should be same and faster
        sites = [int(singlecell.N * u) for u in np.random.rand(singlecell.N)]  # this should be 5-10% faster
state_array_ext = np.zeros((singlecell.N, np.shape(singlecell.state_array)[1] + 1))
state_array_ext[:, :-1] = singlecell.state_array # TODO: make sure don't need array copy
state_array_ext[:,-1] = singlecell.state_array[:,-1]
for idx, site in enumerate(sites): # TODO: parallelize approximation
#print "PRE A", singlecell.steps + 1, state_array_ext[site, singlecell.steps + 1]
state_array_ext = gpu_glauber_dynamics_update(state_array_ext, site, singlecell.steps + 1, intxn_matrix, rsamples[idx],
beta=beta, ext_field=ext_field, app_field=app_field,
ext_field_strength=ext_field_strength,
app_field_strength=app_field_strength)
#print "POST A", singlecell.steps + 1, state_array_ext[site, singlecell.steps + 1]
singlecell.state_array = state_array_ext
singlecell.steps += 1
singlecell.state = state_array_ext[:, -1]
return singlecell
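# Illustration (sketch, not in the original notebook): the two site-ordering schemes used above.
# async_batch=True gives a shuffled permutation (each site updated exactly once per sweep);
# async_batch=False gives N draws with replacement (the same site can repeat within a sweep).
demo_N = 10
demo_sites_shuffled = range(demo_N)
shuffle(demo_sites_shuffled)
demo_sites_replacement = [int(demo_N * u) for u in np.random.rand(demo_N)]
print 'shuffled (no repeats):        ', demo_sites_shuffled
print 'with replacement (may repeat):', demo_sites_replacement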
def gpu_synced_update_step(current_state, intxn_matrix, beta=BETA, ext_field=None, app_field=None,
ext_field_strength=EXT_FIELD_STRENGTH, app_field_strength=APP_FIELD_STRENGTH):
# copy state to gpu
N = current_state.shape[0]
gpu_current_state = torch.from_numpy(current_state).cuda()
# Step 1 - J x(t)
#print intxn_matrix.shape, gpu_current_state.shape
gpu_Jx = torch.mv(intxn_matrix, gpu_current_state)
    # Step 2 - pointwise transform as 1/(1 + exp(-2 * beta * elem)), i.e. sigmoid(2 * beta * elem)
    #gpu_transformed_Jx = torch.mul(gpu_Jx, 2.0*beta)
    #gpu_transformed_Jx = torch.sigmoid(gpu_transformed_Jx)
    gpu_transformed_Jx = torch.sigmoid(2.0 * beta * gpu_Jx)
    # Step 3 - pointwise comparison to a U[0,1] sample vector (if elem - u > 0, it's 1.0, else -1.0)
    gpu_transformed_Jx = torch.add(gpu_transformed_Jx, -torch.cuda.DoubleTensor(N).uniform_())
    # Step 4 - convert to spins -1, +1 using torch.sign
gpu_state_vec_next = torch.sign(gpu_transformed_Jx)
# Step 5 - send back to cpu
state_vec_next = gpu_state_vec_next.cpu().numpy()
#print type(state_vec_next), state_vec_next.shape, N
return state_vec_next
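# Reference sketch (not in the original notebook): the same batched Glauber step in plain NumPy,
# mirroring Steps 1-4 above on the CPU. It will not match a GPU call draw-for-draw because the
# uniform random samples differ; it only documents the math of the synchronous update.
def cpu_synced_update_step(current_state, intxn_matrix_cpu, beta=BETA):
    Jx = np.dot(intxn_matrix_cpu, current_state)           # Step 1 - J x(t)
    prob_on = 1.0 / (1.0 + np.exp(-2.0 * beta * Jx))       # Step 2 - 1/(1 + exp(-2*beta*elem))
    unirand_vec = np.random.rand(current_state.shape[0])   # Step 3 - U[0,1] samples
    return np.sign(prob_on - unirand_vec)                  # Step 4 - map to -1/+1 spins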
def gpu_update_state_sync(singlecell, intxn_matrix, ext_field=None, ext_field_strength=EXT_FIELD_STRENGTH,
beta=BETA, app_field=None, app_field_strength=APP_FIELD_STRENGTH, async_batch=ASYNC_BATCH):
"""
    BATCHED (i.e. synchronous) evolution:
    x(t+1) = sign( W(J x(t)) - u ), where W is the pointwise map elem -> 1/(1 + np.exp(-2*beta*elem))
    and u is a vector of uniform [0,1] samples, so each element of x(t+1) is +1 or -1
"""
assert intxn_matrix.is_cuda
assert app_field is None and ext_field is None
current_state = singlecell.state_array[:,-1]
# update state here
state_vec_next = gpu_synced_update_step(current_state, intxn_matrix, beta=beta, ext_field=ext_field,
app_field=app_field, ext_field_strength=ext_field_strength, app_field_strength=app_field_strength)
# copy extend state array
state_array_ext = np.zeros((singlecell.N, np.shape(singlecell.state_array)[1] + 1))
state_array_ext[:, :-1] = singlecell.state_array # TODO: make sure don't need array copy
state_array_ext[:,-1] = state_vec_next[:]
# update attributes
singlecell.state_array = state_array_ext
singlecell.steps += 1
singlecell.state = state_array_ext[:, -1]
return singlecell
# TODO: is a flag in simsetup that selects between update_state and gpu_update_state all we need? plus an optional GPU copy in simsetup at the top
def gpu_singlecell_sim(init_state=None, init_id=None, iterations=NUM_FULL_STEPS, beta=BETA, simsetup=None,
gpu_simsetup=None, field_protocol=None, field_level=None, flag_burst_error=False, flag_write=True,
analysis_subdir=None, plot_period=10, verbose=True):
"""
init_state: N x 1
init_id: None, or memory label like 'esc', or arbitrary label (e.g. 'All on')
iterations: main simulation loop duration
field_protocol: label for call field_setup to build field dict for applied field
flag_burst_error: if True, randomly flip some TFs at each BURST_ERROR_PERIOD (see ...constants.py)
flag_write: False only if want to avoid saving state to file
analysis_subdir: use to store data for non-standard runs
plot_period: period at which to plot cell state projection onto memory subspace
"""
# TODO: if dirs is None then do run subdir setup (just current run dir?)
# IO setup
if flag_write:
io_dict = run_subdir_setup(run_subfolder=analysis_subdir)
else:
if verbose:
print "Warning: flag_write set to False -- nothing will be saved"
io_dict = None
# simsetup unpack
if simsetup is None:
simsetup = singlecell_simsetup()
N, P, gene_labels, memory_labels, gene_id, celltype_id, xi, _, a_inv, intxn_matrix, _ = unpack_simsetup(simsetup)
gpu_intxn_matrix = simsetup['gpu_J']
# Cell setup
N = xi.shape[0]
if init_state is None:
if init_id is None:
init_id = "All_on"
init_state = 1 + np.zeros(N) # start with all genes on
else:
init_state = xi[:, celltype_id[init_id]]
singlecell = Cell(init_state, init_id, memories_list=memory_labels, gene_list=gene_labels)
# Input checks
field_dict = field_setup(simsetup, protocol=field_protocol, level=field_level)
assert not field_dict['time_varying'] # TODO not yet supported
app_field = field_dict['app_field']
app_field_strength = field_dict['app_field_strength']
# Simulate
for step in xrange(iterations-1):
if verbose:
print "cell steps:", singlecell.steps, " H(state) =", singlecell.get_energy(intxn_matrix=intxn_matrix) # TODO need general intxn_matrix parent class
# apply burst errors
if flag_burst_error and step % BURST_ERROR_PERIOD == 0:
singlecell.apply_burst_errors()
        # prep applied field -- TODO: check whether passing an array of zeros and dropping these 'if not None' checks is faster
if flag_write:
if singlecell.steps % plot_period == 0:
fig, ax, proj = singlecell.plot_projection(a_inv, xi, use_radar=True, pltdir=io_dict['plotdatadir'])
fig, ax, proj = singlecell.plot_overlap(xi, use_radar=True, pltdir=io_dict['plotdatadir'])
#singlecell = gpu_update_state(singlecell, gpu_intxn_matrix, beta=beta, app_field=app_field, app_field_strength=app_field_strength, async_batch=ASYNC_BATCH)
singlecell = gpu_update_state_sync(singlecell, gpu_intxn_matrix, beta=beta, app_field=app_field, app_field_strength=app_field_strength, async_batch=ASYNC_BATCH)
# Write
if verbose:
print singlecell.get_current_state()
if flag_write:
if verbose:
print "Writing state to file.."
singlecell.write_state(io_dict['datadir'])
if verbose:
print io_dict['basedir']
print "Done"
return singlecell.get_state_array(), io_dict
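In [ ]:
# Sketch (assumption, following the TODO above): a single dispatch point so a flag chooses between the
# CPU and GPU update paths. The 'use_gpu' and 'sync' arguments are hypothetical, and the CPU branch
# assumes the existing Cell.update_state(intxn_matrix, beta=...) signature from singlecell_class.
def dispatch_update_state(singlecell, simsetup, use_gpu=True, sync=True, beta=BETA):
    if use_gpu and torch.cuda.is_available():
        if sync:
            return gpu_update_state_sync(singlecell, simsetup['gpu_J'], beta=beta)
        return gpu_update_state(singlecell, simsetup['gpu_J'], beta=beta)
    return singlecell.update_state(simsetup['J'], beta=beta)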
In [171]:
t0 = time.time()
gpu_singlecell_sim(init_id='Macrophage (A)', field_protocol=None, plot_period=10,
iterations=50, simsetup=simsetup, flag_write=True, beta=2.2)
deltat = time.time() - t0
print "GPU timer:", deltat
In [44]:
# large random matrix dot product
n = 5000
m = 2000
local_A = np.random.randn(n,m).astype('float32')
local_B = np.random.randn(n,m).astype('float32')
t0 = time.time()
local_dot = np.dot(local_A.T, local_B)
tdelta = time.time() - t0
print "Expect shape of A^T * B to be m x m:", local_dot.shape
print "First few elements:\n", local_dot[0:2, 0:2]
print "Time:", tdelta, '\n'
# now send data to gpu, compute, and return
print "Trying the same matrix product on the GPU..."
# STEP 1 - convert numpy to pytorch tensor
t0 = time.time()
torch_A = torch.from_numpy(local_A)
torch_B = torch.from_numpy(local_B)
print "Step 1: time =", time.time() - t0
print "Step 1: types are", type(local_A), type(torch_B)
# STEP 2 - send to gpu
t0 = time.time()
gpu_A = torch_A.cuda()
gpu_B = torch_B.cuda()
print "Step 2: time =", time.time() - t0
print "Step 2: type is", type(gpu_A)
# STEP 3 - compute
t0 = time.time()
gpu_dot = torch.mm(gpu_A.t(), gpu_B)
print "Step 3: time =", time.time() - t0
# STEP 4 - send back
t0 = time.time()
torch_A = gpu_A.cpu()
torch_B = gpu_B.cpu()
torch_dot = gpu_dot.cpu()
print "Step 4: time =", time.time() - t0
print "GPU: Expect shape of A^T * B to be m x m:", torch_dot.shape
print "First few elements:\n", torch_dot[0:2, 0:2]
# and just cpu pytorch vs numpy
t0 = time.time()
torch_dot = torch.mm(torch_A.t(), torch_B)
print "\nPyTorch non-cuda timing: time =", time.time() - t0
print "Torch CPU: Expect shape of A^T * B to be m x m:", torch_dot.shape
print "First few elements:\n", torch_dot[0:2, 0:2]
print "\nGPU info:"
print "torch.cuda.memory_allocated()", torch.cuda.memory_allocated()
print "torch.cuda.memory_cached()", torch.cuda.memory_cached()
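In [ ]:
# Timing caveat (sketch, not in the original notebook): CUDA kernel launches are asynchronous, so the
# Step 3 timing above largely measures the launch rather than the multiply itself. Synchronizing before
# reading the clock gives a fairer wall-clock number.
t0 = time.time()
gpu_dot = torch.mm(gpu_A.t(), gpu_B)
torch.cuda.synchronize()  # block until the kernel has actually finished
print "Step 3 (synchronized): time =", time.time() - t0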
In [148]:
N = 1000
t0 = time.time()
b = torch.cuda.FloatTensor(N).uniform_()
c = 4*torch.mul(b, -1.0)
print c.data[0:5]
print time.time()-t0
t0 = time.time()
a = torch.rand(N, 1)
b = a.cuda()
c = torch.mul(b, -1.0)
print time.time()-t0
In [ ]: