In [160]:
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import torch
import time

# hack: append the parent directory to sys.path so packages in the 'celltypes' project can be imported
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# torch defaults
torch.set_default_tensor_type('torch.DoubleTensor')
    
from singlecell.singlecell_simsetup import singlecell_simsetup
from singlecell.singlecell_simulate import singlecell_sim

System checks


In [161]:
print sys.version
print time.time()


2.7.15rc1 (default, Nov 12 2018, 14:31:15) 
[GCC 7.3.0]
1547316995.31

In [162]:
# check cuda is working...
print "torch.cuda.current_device()     - ", torch.cuda.current_device()
print "torch.cuda.device(0)            - ", torch.cuda.device(0)
print "torch.cuda.device_count()       - ", torch.cuda.device_count()
print "torch.cuda.get_device_name(0)   - ", torch.cuda.get_device_name(0)
print "torch.cuda.is_available()       - ", torch.cuda.is_available()
print "torch.cuda.memory_allocated()   - ", torch.cuda.memory_allocated() * 1e-6, 'MB'
print "torch.cuda.memory_cached()      - ", torch.cuda.memory_cached() * 1e-6, 'MB'


torch.cuda.current_device()     -  0
torch.cuda.device(0)            -  <torch.cuda.device object at 0x7f9a740bbc50>
torch.cuda.device_count()       -  1
torch.cuda.get_device_name(0)   -  GeForce GTX 1080
torch.cuda.is_available()       -  True
torch.cuda.memory_allocated()   -  125.024768 MB
torch.cuda.memory_cached()      -  288.620544 MB

Simsetup on CPU + naive GPU copy


In [163]:
simsetup = singlecell_simsetup()


loading npz of arr genes cells at /media/homes/msmart/Development/repos/biomodels/celltypes/input/memories/2018_scmca_mems_genes_types_boolean_compressed_pruned_A_TFonly.npz ...
loaded arr, genes, cells: (2218, 98) (2218,) (98,)
Note network method for interaction_matrix() is projection

In [164]:
# TODO maybe add the gpu variants to the vanilla simsetup script, AND add a gpu flag to simsetup?
def copy_simsetup_arrays_to_gpu(simsetup, verbose=True):
    if verbose:
        print 'Init tensor mem on GPU (MB):', torch.cuda.memory_allocated() * 1e-6
    simsetup['gpu_J'] = torch.from_numpy(simsetup['J']).cuda()
    simsetup['gpu_XI'] = torch.from_numpy(simsetup['XI']).cuda()
    simsetup['gpu_AINV'] = torch.from_numpy(simsetup['A_INV']).cuda()
    if verbose:
        print 'Final tensor mem on GPU (MB):', torch.cuda.memory_allocated() * 1e-6
    return simsetup

print simsetup['J'].shape
simsetup = copy_simsetup_arrays_to_gpu(simsetup)
print simsetup['J'].shape  # cpu numpy J and torch gpu J both in simsetup now
print type(simsetup['gpu_J']), simsetup['gpu_J'].shape, simsetup['gpu_J'].is_cuda
print type(simsetup['gpu_J'].cpu()), simsetup['gpu_J'].cpu().shape, simsetup['gpu_J'].cpu().is_cuda


(2218, 2218)
Init tensor mem on GPU (MB): 125.024768
Final tensor mem on GPU (MB): 166.38976
(2218, 2218)
<class 'torch.Tensor'> torch.Size([2218, 2218]) True
<class 'torch.Tensor'> torch.Size([2218, 2218]) False
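
A possible shape for the TODO in the cell above (a sketch only; the wrapper name is hypothetical and not part of the repo): put the GPU copy behind a single flag on top of the existing setup call.


In [ ]:
# Hypothetical convenience wrapper (sketch): singlecell_simsetup() has no gpu flag today,
# so this just chains the existing call with copy_simsetup_arrays_to_gpu() defined above.
def singlecell_simsetup_maybe_gpu(use_gpu=True, verbose=True):
    ss = singlecell_simsetup()
    if use_gpu and torch.cuda.is_available():
        ss = copy_simsetup_arrays_to_gpu(ss, verbose=verbose)
    return ss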

Singlecell basic simulation: CPU vs GPU


In [165]:
t0 = time.time()
singlecell_sim(init_id='Macrophage (A)', field_protocol=None, plot_period=10,
               iterations=50, simsetup=simsetup, flag_write=True, beta=2.2)
deltat = time.time() - t0
print "CPU timer:", deltat


Warning: Arg 'level' is None -- setting field level to 'level_1'
cell steps: 0  H(state) = -1059.9999999999943
cell steps: 1  H(state) = -974.2196935041254
cell steps: 2  H(state) = -960.9644347041556
cell steps: 3  H(state) = -988.3584948319874
cell steps: 4  H(state) = -992.9954416400026
cell steps: 5  H(state) = -996.4359518841029
cell steps: 6  H(state) = -998.6586718111878
cell steps: 7  H(state) = -991.6673165119836
cell steps: 8  H(state) = -982.7363902064856
cell steps: 9  H(state) = -1007.743858488006
cell steps: 10  H(state) = -961.9471730608138
cell steps: 11  H(state) = -978.2490788581464
cell steps: 12  H(state) = -997.9018155039371
cell steps: 13  H(state) = -988.0143629462646
cell steps: 14  H(state) = -999.6240145299898
cell steps: 15  H(state) = -972.2613485073005
cell steps: 16  H(state) = -1011.6976157578556
cell steps: 17  H(state) = -1001.9636820507197
cell steps: 18  H(state) = -997.026634931203
cell steps: 19  H(state) = -976.1731057783068
cell steps: 20  H(state) = -978.6644211110868
cell steps: 21  H(state) = -969.8821013896134
cell steps: 22  H(state) = -982.1489327326905
cell steps: 23  H(state) = -971.9021344692267
cell steps: 24  H(state) = -1007.2857609143946
cell steps: 25  H(state) = -995.3502458195383
cell steps: 26  H(state) = -980.2629765025928
cell steps: 27  H(state) = -992.5000358570352
cell steps: 28  H(state) = -1005.0352298241212
cell steps: 29  H(state) = -1008.0816032829631
cell steps: 30  H(state) = -994.4856089877595
cell steps: 31  H(state) = -1000.0195279470543
cell steps: 32  H(state) = -983.8600705813185
cell steps: 33  H(state) = -1010.8870372913343
cell steps: 34  H(state) = -973.5056399052555
cell steps: 35  H(state) = -994.5273018464225
cell steps: 36  H(state) = -980.8045442482678
cell steps: 37  H(state) = -987.4773630063178
cell steps: 38  H(state) = -995.7572140557327
cell steps: 39  H(state) = -960.651194070103
cell steps: 40  H(state) = -980.9814718452701
cell steps: 41  H(state) = -984.9467403893557
cell steps: 42  H(state) = -970.5217836704435
cell steps: 43  H(state) = -972.5211026726819
cell steps: 44  H(state) = -987.5143742238672
cell steps: 45  H(state) = -988.4657353064161
cell steps: 46  H(state) = -1004.8012775653606
cell steps: 47  H(state) = -970.0983323429632
cell steps: 48  H(state) = -1006.768573229293
[-1. -1. -1. ...  1.  1.  1.]
Writing state to file..
/media/homes/msmart/Development/repos/biomodels/celltypes/singlecell/runs/2019-01-12 01.16.46PM
Done
CPU timer: 12.5117371082

In [170]:
from singlecell.singlecell_class import Cell
from singlecell.singlecell_constants import NUM_FULL_STEPS, BURST_ERROR_PERIOD, APP_FIELD_STRENGTH, EXT_FIELD_STRENGTH, \
                                            BETA, ASYNC_BATCH
from singlecell.singlecell_data_io import run_subdir_setup, runinfo_append
from singlecell.singlecell_fields import field_setup
from singlecell.singlecell_simsetup import singlecell_simsetup, unpack_simsetup

from random import shuffle


# NOTE: this is the only routine that changed relative to the CPU version -- is it worth benchmarking against a simple 'if gpu else cpu' branch just for this?
def gpu_internal_field(state, gene_idx, t, intxn_matrix):
    """
    Original slow summation:
    h_1 = 0
    intxn_list = range(0, gene_idx) + range(gene_idx+1, N)
    for j in intxn_list:
        h_1 += J[gene_idx,j] * state[j,t]  # plus some other field terms... do we care for these?
    """
    # move state to gpu
    gpu_state_vec_at_t = torch.from_numpy(state[:,t]).cuda()
    # compute
    gpu_internal_field = torch.dot(intxn_matrix[gene_idx,:], gpu_state_vec_at_t)  # note diagonals assumed to be zero (enforce in J definition)
    # send scalar back
    internal_field = gpu_internal_field.item()
    #internal_field = gpu_internal_field.cpu()
    #print type(internal_field), internal_field
    return internal_field


def gpu_glauber_dynamics_update(state, gene_idx, t, intxn_matrix, unirand, beta=BETA, ext_field=None, app_field=None,
                                ext_field_strength=EXT_FIELD_STRENGTH, app_field_strength=APP_FIELD_STRENGTH):
    """
    unirand: pass a uniform 0,1 random number
        - note previously unirand = random() OR unirand = np.random_intel.random() from intel python distribution
    See page 107-111 Amit for discussion on functional form
    ext_field - N x 1 - field external to the cell in a signalling sense; exosome field in the multicell sim
    ext_field_strength  - scaling factor for ext_field
    app_field - N x 1 - unnatural external field (e.g. force a TF on for some time period experimentally)
    app_field_strength - scaling factor for app_field
    """
    assert intxn_matrix.is_cuda
    total_field = gpu_internal_field(state, gene_idx, t, intxn_matrix)
    if ext_field is not None:
        total_field += ext_field_strength * ext_field[gene_idx]
    if app_field is not None:
        total_field += app_field_strength * app_field[gene_idx]
    prob_on_after_timestep = 1 / (1 + np.exp(-2*beta*total_field))  # probability that site i will be "up" after the timestep
    #print  "PRE state[gene_idx, t]", t, state[gene_idx, t], unirand, prob_on_after_timestep
    if prob_on_after_timestep > unirand:
        state[gene_idx, t] = 1.0
    else:
        state[gene_idx, t] = -1.0
    #print  "POST state[gene_idx, t]", t, state[gene_idx, t], unirand, total_field
    return state


# TODO make class method in singlecell
def gpu_update_state(singlecell, intxn_matrix, ext_field=None, ext_field_strength=EXT_FIELD_STRENGTH, 
                     beta=BETA, app_field=None, app_field_strength=APP_FIELD_STRENGTH, async_batch=ASYNC_BATCH):
    """
    async_batch: if True, shuffle the N sites and update each exactly once per timestep (no replacement);
                 if False, draw N sites with replacement ('fully random'), so the same site can be updated
                 twice in a row instead of being separated by at least N substeps
                 these produce different short term behaviour, but should reach same steady state
    ext_field - N x 1 - field external to the cell in a signalling sense; exosome field in the multicell sim
    ext_field_strength  - scaling factor for ext_field
    app_field - N x 1 - unnatural external field (e.g. force a TF on for some time period experimentally)
    app_field_strength - scaling factor for app_field
    """
    assert intxn_matrix.is_cuda

    sites = range(singlecell.N)
    rsamples = np.random.rand(singlecell.N)  # optimization: pass one sample to each of the N single-spin update calls  TODO: benchmark vs Intel's RNG
    if async_batch:
        shuffle(sites)  # randomize the site update ordering each timestep
    else:
        #sites = np.random.choice(self.N, self.N, replace=True)
        #sites = [int(self.N*np.random.random()) for _ in xrange(self.N)]  # this should be same and faster
        sites = [int(singlecell.N * u) for u in np.random.rand(singlecell.N)]  # this should be 5-10% faster

    state_array_ext = np.zeros((singlecell.N, np.shape(singlecell.state_array)[1] + 1))
    state_array_ext[:, :-1] = singlecell.state_array  # TODO: make sure don't need array copy
    state_array_ext[:,-1] = singlecell.state_array[:,-1]
    for idx, site in enumerate(sites):          # TODO: parallelize approximation
        #print "PRE A", singlecell.steps + 1, state_array_ext[site, singlecell.steps + 1]
        state_array_ext = gpu_glauber_dynamics_update(state_array_ext, site, singlecell.steps + 1, intxn_matrix, rsamples[idx],
                                                      beta=beta, ext_field=ext_field, app_field=app_field,
                                                      ext_field_strength=ext_field_strength,
                                                      app_field_strength=app_field_strength)
        #print "POST A", singlecell.steps + 1, state_array_ext[site, singlecell.steps + 1]
    singlecell.state_array = state_array_ext
    singlecell.steps += 1
    singlecell.state = state_array_ext[:, -1]
    return singlecell


def gpu_synced_update_step(current_state, intxn_matrix, beta=BETA, ext_field=None, app_field=None,
                           ext_field_strength=EXT_FIELD_STRENGTH, app_field_strength=APP_FIELD_STRENGTH):
    # copy state to gpu
    N = current_state.shape[0]
    gpu_current_state = torch.from_numpy(current_state).cuda()
    # Step 1 - J x(t)
    #print intxn_matrix.shape, gpu_current_state.shape
    gpu_Jx = torch.mv(intxn_matrix, gpu_current_state)
    # Step 2 - pointwise transform as 1/(1 + exp( -2 * beta * elem))
    #gpu_transformed_Jx = torch.mul(gpu_Jx, -2.0*beta)
    #gpu_transformed_Jx = torch.sigmoid(gpu_transformed_Jx)
    gpu_transformed_Jx = torch.sigmoid(-2.0*beta*gpu_Jx)
    
    # Step 3 - pointwise comparison to rsamples U[0,1] vector (if elem - u > 0, then its 1.0, else -1.0)
    gpu_transformed_Jx = torch.add(gpu_transformed_Jx, -torch.cuda.DoubleTensor(N).uniform_())
    # Step 4 - convert to boolean -1, 1 using torch.sign
    gpu_state_vec_next = torch.sign(gpu_transformed_Jx)
    # Step 5 - send back to cpu
    state_vec_next = gpu_state_vec_next.cpu().numpy()
    #print type(state_vec_next), state_vec_next.shape, N
    return state_vec_next


def gpu_update_state_sync(singlecell, intxn_matrix, ext_field=None, ext_field_strength=EXT_FIELD_STRENGTH, 
                          beta=BETA, app_field=None, app_field_strength=APP_FIELD_STRENGTH, async_batch=ASYNC_BATCH):
    """
    BATCHED evolution i.e. synchronous...
    the whole update can be written as a pointwise op W applied to J x(t), with W(h) = 1/(1 + exp(-2*beta*h));
    each element of W(J x(t)) is then compared against a uniform [0,1] sample to return +1 or -1 for x(t+1)
    """
    assert intxn_matrix.is_cuda
    assert app_field is None and ext_field is None

    current_state = singlecell.state_array[:,-1]
    # update state here
    state_vec_next = gpu_synced_update_step(current_state, intxn_matrix, beta=beta, ext_field=ext_field, 
                                            app_field=app_field, ext_field_strength=ext_field_strength, app_field_strength=app_field_strength)
    # copy extend state array
    state_array_ext = np.zeros((singlecell.N, np.shape(singlecell.state_array)[1] + 1))
    state_array_ext[:, :-1] = singlecell.state_array  # TODO: make sure don't need array copy
    state_array_ext[:,-1] = state_vec_next[:]
    # update attributes
    singlecell.state_array = state_array_ext
    singlecell.steps += 1
    singlecell.state = state_array_ext[:, -1]
    
    return singlecell

    

# TODO: a flag in simsetup selecting between the CPU update_state and this GPU update may be all that is needed (plus the optional GPU copy of simsetup arrays done at the top)
def gpu_singlecell_sim(init_state=None, init_id=None, iterations=NUM_FULL_STEPS, beta=BETA, simsetup=None,
                       gpu_simsetup=None, field_protocol=None, field_level=None, flag_burst_error=False, flag_write=True,
                       analysis_subdir=None, plot_period=10, verbose=True):
    """
    init_state: N x 1
    init_id: None, or memory label like 'esc', or arbitrary label (e.g. 'All on')
    iterations: main simulation loop duration
    field_protocol: label for call field_setup to build field dict for applied field
    flag_burst_error: if True, randomly flip some TFs at each BURST_ERROR_PERIOD (see ...constants.py)
    flag_write: False only if want to avoid saving state to file
    analysis_subdir: use to store data for non-standard runs
    plot_period: period at which to plot cell state projection onto memory subspace
    """
    # TODO: if dirs is None then do run subdir setup (just current run dir?)
    # IO setup
    if flag_write:
        io_dict = run_subdir_setup(run_subfolder=analysis_subdir)
    else:
        if verbose:
            print "Warning: flag_write set to False -- nothing will be saved"
        io_dict = None

    # simsetup unpack
    if simsetup is None:
        simsetup = singlecell_simsetup()
    N, P, gene_labels, memory_labels, gene_id, celltype_id, xi, _, a_inv, intxn_matrix, _ = unpack_simsetup(simsetup)
    gpu_intxn_matrix = simsetup['gpu_J']
    
    # Cell setup
    N = xi.shape[0]
    if init_state is None:
        if init_id is None:
            init_id = "All_on"
            init_state = 1 + np.zeros(N)  # start with all genes on
        else:
            init_state = xi[:, celltype_id[init_id]]
    singlecell = Cell(init_state, init_id, memories_list=memory_labels, gene_list=gene_labels)

    # Input checks
    field_dict = field_setup(simsetup, protocol=field_protocol, level=field_level)
    assert not field_dict['time_varying']  # TODO not yet supported
    app_field = field_dict['app_field']
    app_field_strength = field_dict['app_field_strength']

    # Simulate
    for step in xrange(iterations-1):
        if verbose:
            print "cell steps:", singlecell.steps, " H(state) =", singlecell.get_energy(intxn_matrix=intxn_matrix)  # TODO need general intxn_matrix parent class
        # apply burst errors
        if flag_burst_error and step % BURST_ERROR_PERIOD == 0:
            singlecell.apply_burst_errors()
        # prep applied field TODO see if better speed to pass array of zeros and ditch all these if not None checks...
        if flag_write:
            if singlecell.steps % plot_period == 0:
                fig, ax, proj = singlecell.plot_projection(a_inv, xi, use_radar=True, pltdir=io_dict['plotdatadir'])
                fig, ax, proj = singlecell.plot_overlap(xi, use_radar=True, pltdir=io_dict['plotdatadir'])
        #singlecell = gpu_update_state(singlecell, gpu_intxn_matrix, beta=beta, app_field=app_field, app_field_strength=app_field_strength, async_batch=ASYNC_BATCH)
        singlecell = gpu_update_state_sync(singlecell, gpu_intxn_matrix, beta=beta, app_field=app_field, app_field_strength=app_field_strength, async_batch=ASYNC_BATCH)

    # Write
    if verbose:
        print singlecell.get_current_state()
    if flag_write:
        if verbose:
            print "Writing state to file.."
        singlecell.write_state(io_dict['datadir'])
    if verbose:
        print io_dict['basedir']
        print "Done"
    return singlecell.get_state_array(), io_dict
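
Quick sanity check of gpu_synced_update_step on a toy problem (a sketch; the tiny J and state below are made up for illustration): one synchronous step should map any +/-1 state vector to another +/-1 vector of the same length.


In [ ]:
# Toy check (sketch): small symmetric J with zero diagonal, random +/-1 state of length N_toy.
N_toy = 8
J_toy = np.random.randn(N_toy, N_toy)
J_toy = 0.5 * (J_toy + J_toy.T)
np.fill_diagonal(J_toy, 0.0)           # diagonals assumed zero, as in the J definition
gpu_J_toy = torch.from_numpy(J_toy).cuda()
x_toy = np.random.choice([-1.0, 1.0], size=N_toy)
x_next = gpu_synced_update_step(x_toy, gpu_J_toy, beta=2.2)
print x_next
print np.all(np.abs(x_next) == 1.0)    # expect True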

In [171]:
t0 = time.time()
gpu_singlecell_sim(init_id='Macrophage (A)', field_protocol=None, plot_period=10,
               iterations=50, simsetup=simsetup, flag_write=True, beta=2.2)
deltat = time.time() - t0
print "GPU timer:", deltat


Warning: Arg 'level' is None -- setting field level to 'level_1'
cell steps: 0  H(state) = -1059.9999999999943
cell steps: 1  H(state) = -999.2018245944055
cell steps: 2  H(state) = -995.0844336955593
cell steps: 3  H(state) = -955.211442761804
cell steps: 4  H(state) = -976.683732362431
cell steps: 5  H(state) = -994.9553650369086
cell steps: 6  H(state) = -985.2867072890498
cell steps: 7  H(state) = -975.7213998629531
cell steps: 8  H(state) = -981.7208086571686
cell steps: 9  H(state) = -976.6101500962332
cell steps: 10  H(state) = -981.0348275653338
cell steps: 11  H(state) = -970.7078200635472
cell steps: 12  H(state) = -973.5016925959127
cell steps: 13  H(state) = -983.4318256447925
cell steps: 14  H(state) = -1007.7228141418176
cell steps: 15  H(state) = -989.883352564922
cell steps: 16  H(state) = -999.3357962311004
cell steps: 17  H(state) = -985.6202484103844
cell steps: 18  H(state) = -998.4487147881733
cell steps: 19  H(state) = -983.0117038660318
cell steps: 20  H(state) = -1000.4463499114033
cell steps: 21  H(state) = -1000.5776027586953
cell steps: 22  H(state) = -966.0172843514315
cell steps: 23  H(state) = -991.5406311913694
cell steps: 24  H(state) = -1000.9934206881538
cell steps: 25  H(state) = -1000.1002604658183
cell steps: 26  H(state) = -978.1437445049978
cell steps: 27  H(state) = -995.9681321944468
cell steps: 28  H(state) = -991.8317160458125
cell steps: 29  H(state) = -988.4919647377951
cell steps: 30  H(state) = -982.0308977985387
cell steps: 31  H(state) = -983.4499937090552
cell steps: 32  H(state) = -995.7144525226918
cell steps: 33  H(state) = -1005.568799410554
cell steps: 34  H(state) = -981.6740019506368
cell steps: 35  H(state) = -1005.7267331059462
cell steps: 36  H(state) = -998.3532333742645
cell steps: 37  H(state) = -986.3277164308807
cell steps: 38  H(state) = -985.2363298322124
cell steps: 39  H(state) = -975.6337152332633
cell steps: 40  H(state) = -989.818882363902
cell steps: 41  H(state) = -994.0006499824997
cell steps: 42  H(state) = -994.4500107116044
cell steps: 43  H(state) = -989.9473761163132
cell steps: 44  H(state) = -987.0688903738037
cell steps: 45  H(state) = -959.445194001775
cell steps: 46  H(state) = -988.0446096958948
cell steps: 47  H(state) = -991.3245895460578
cell steps: 48  H(state) = -985.4135506643775
[ 1.  1.  1. ... -1. -1. -1.]
Writing state to file..
/media/homes/msmart/Development/repos/biomodels/celltypes/singlecell/runs/2019-01-12 01.18.02PM
Done
GPU timer: 11.0539469719
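
The NOTE in the update-rule cell asks whether the per-spin GPU path is worth benchmarking on its own. A rough microbenchmark of a single-spin field evaluation (a sketch; the rep count and toy state column are made up for illustration) shows where the per-call host-to-device copy in gpu_internal_field enters the cost:


In [ ]:
# Rough microbenchmark (sketch): per-spin CPU dot product vs gpu_internal_field,
# which copies the full state column to the GPU on every call before the dot product.
N = simsetup['J'].shape[0]
state_col = np.random.choice([-1.0, 1.0], size=(N, 1))
reps = 1000

t0 = time.time()
for _ in xrange(reps):
    h_cpu = np.dot(simsetup['J'][0, :], state_col[:, 0])
print "CPU np.dot per spin (s):          ", (time.time() - t0) / reps

t0 = time.time()
for _ in xrange(reps):
    h_gpu = gpu_internal_field(state_col, 0, 0, simsetup['gpu_J'])
print "gpu_internal_field per spin (s):  ", (time.time() - t0) / reps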

Matrix product example


In [44]:
# large random matrix dot product
n = 5000
m = 2000
local_A = np.random.randn(n,m).astype('float32')
local_B = np.random.randn(n,m).astype('float32')
t0 = time.time()
local_dot = np.dot(local_A.T, local_B)
tdelta = time.time() - t0
print "Expect shape of A*B^T to be m x m:", local_dot.shape
print "First few elements:\n", local_dot[0:2, 0:2]
print "Time:", tdelta, '\n'

# now send data to gpu, compute, and return
print "Trying vector dot product on GPU..."
# STEP 1 - convert numpy to pytorch tensor
t0 = time.time()
torch_A = torch.from_numpy(local_A)
torch_B = torch.from_numpy(local_B)
print "Step 1: time =", time.time() - t0
print "Step 1: types are", type(local_A), type(torch_B)
# STEP 2 - send to gpu
t0 = time.time()
gpu_A = torch_A.cuda()
gpu_B = torch_B.cuda()
print "Step 2: time =", time.time() - t0
print "Step 2: type is", type(gpu_A)
# STEP 3 - compute
t0 = time.time()
gpu_dot = torch.mm(gpu_A.t(), gpu_B)
print "Step 3: time =", time.time() - t0
# STEP 4 - send back
t0 = time.time()
torch_A = gpu_A.cpu()
torch_B = gpu_B.cpu()
torch_dot = gpu_dot.cpu()
print "Step 4: time =", time.time() - t0
print "GPU: Expect shape of A*B^T to be m x m:", torch_dot.shape
print "First few elements:\n", torch_dot[0:2, 0:2]

# and just cpu pytorch vs numpy
t0 = time.time()
torch_dot = torch.mm(torch_A.t(), torch_B)
print "\nPyTorch non-cuda timing: time =", time.time() - t0
print "Torch CPU: Expect shape of A*B^T to be m x m:", torch_dot.shape
print "First few elements:\n", torch_dot[0:2, 0:2]

print "\nGPU info:"
print "torch.cuda.memory_allocated()", torch.cuda.memory_allocated()
print "torch.cuda.memory_cached()", torch.cuda.memory_cached()


Expect shape of A^T * B to be m x m: (2000, 2000)
First few elements:
[[-9.538572 23.039274]
 [ 4.491309 12.267576]]
Time: 0.339940786362 

Trying the same matrix product on GPU...
Step 1: time = 0.00435900688171
Step 1: types are <type 'numpy.ndarray'> <class 'torch.Tensor'>
Step 2: time = 0.0176529884338
Step 2: type is <class 'torch.Tensor'>
Step 3: time = 0.000581026077271
Step 4: time = 0.057755947113
GPU: Expect shape of A^T * B to be m x m: torch.Size([2000, 2000])
First few elements:
tensor([[-9.5386, 23.0392],
        [ 4.4913, 12.2676]])

PyTorch non-cuda timing: time = 0.484139204025
Torch CPU: Expect shape of A^T * B to be m x m: torch.Size([2000, 2000])
First few elements:
tensor([[-9.5385, 23.0392],
        [ 4.4913, 12.2676]])

GPU info:
torch.cuda.memory_allocated() 896401408
torch.cuda.memory_cached() 1681260544

Torch Unirand vector on GPU


In [148]:
N = 1000

t0 = time.time()
b = torch.cuda.FloatTensor(N).uniform_()
c = 4*torch.mul(b, -1.0)
print c.data[0:5]
print time.time()-t0


t0 = time.time()
a = torch.rand(N, 1)
b = a.cuda()
c = torch.mul(b, -1.0)
print time.time()-t0


tensor([-0.1793, -0.4243, -0.7067, -1.2775, -1.7694], device='cuda:0')
0.00175309181213
0.000638961791992
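
A follow-up to the comparison above (a sketch; reusing a buffer is only an illustration, not a change to the functions defined earlier): Step 3 of gpu_synced_update_step allocates a fresh torch.cuda.DoubleTensor(N).uniform_() on every call, so it is worth separating the allocation cost from the fill cost.


In [ ]:
# Rough timing in the same style as the cell above (no explicit cuda synchronize):
# fresh allocation + fill per call vs refilling one pre-allocated buffer in place.
N = simsetup['J'].shape[0]
reps = 1000

t0 = time.time()
for _ in xrange(reps):
    r = torch.cuda.DoubleTensor(N).uniform_()   # fresh allocation each call
print "alloc + uniform_ per call (s):  ", (time.time() - t0) / reps

buf = torch.cuda.DoubleTensor(N)
t0 = time.time()
for _ in xrange(reps):
    buf.uniform_()                              # refill the same buffer in place
print "in-place uniform_ per call (s): ", (time.time() - t0) / reps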

In [ ]: