EXP 3 - NYC Taxi

In this experiment we use the NYC taxi dataset. We pass it through a scalar encoder, then a spatial pooler (SP), and finally the temporal memory (TM). We do a single pass through the TM and record the spike train of every cell. These spike trains are then used to estimate pairwise correlations among cells.
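The pairwise correlations below are Pearson correlation coefficients (PCCs) between binary spike trains, computed with computePWCorrelations from htmresearch.support.neural_correlations_utils. As a minimal, self-contained illustration of the quantity being estimated (the toy spikes array is made up for this example), the PCC matrix of a cells x timesteps binary matrix can be obtained with np.corrcoef:

import numpy as np

# Toy spike-train matrix: 3 cells x 10 timesteps (1 = spike)
spikes = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
                   [1, 0, 1, 0, 1, 0, 1, 0, 0, 1],
                   [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])

pcc = np.corrcoef(spikes)  # 3 x 3 matrix of pairwise PCCs (rows = cells)
print pcc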


In [1]:
import numpy as np
import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd

from nupic.encoders import ScalarEncoder
from nupic.bindings.algorithms import TemporalMemory as TM
from nupic.bindings.algorithms import SpatialPooler as SP
from htmresearch.support.neural_correlations_utils import *

random.seed(1)

In [2]:
inputSize = 109
maxItems = 17520
tmEpochs = 1
totalTS = maxItems * tmEpochs

In [3]:
# Read the CSV file, skipping the two metadata rows that follow the header
df = pd.read_csv('nyc_taxi.csv', skiprows=[1, 2])

In [4]:
tm = TM(columnDimensions = (2048,),
        cellsPerColumn=8, # reduced from the initial value of 32
        initialPermanence=0.21,
        connectedPermanence=0.3,
        minThreshold=15,
        maxNewSynapseCount=40,
        permanenceIncrement=0.1,
        permanenceDecrement=0.1,
        activationThreshold=15,
        predictedSegmentDecrement=0.01
       )

sparsity = 0.02
sparseCols = int(tm.numberOfColumns() * sparsity)

sp = SP(inputDimensions=(inputSize,),
        columnDimensions=(2048,),
        potentialRadius = int(0.5*inputSize),
        numActiveColumnsPerInhArea = sparseCols,
        potentialPct = 0.9,
        globalInhibition = True,
        synPermActiveInc = 0.0001,
        synPermInactiveDec = 0.0005,
        synPermConnected = 0.5,
        boostStrength = 0.0,
        spVerbosity = 1
       )
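With globalInhibition enabled, numActiveColumnsPerInhArea fixes the SP output sparsity: int(2048 * 0.02) = 40 columns win the inhibition at every time step, independent of the input. A quick sanity check, for reference only:

print sparseCols                              # 40
print float(sparseCols) / sp.getNumColumns()  # ~0.0195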

Part I. Encoder


In [5]:
rawValues = []
remainingRows = maxItems
numTrainingItems = 15000
trainSet = []
nonTrainSet = []

# Scalar encoder: 109-bit output with 29 active bits; out-of-range inputs are clipped
se = ScalarEncoder(n=inputSize, w=29, minval=0, maxval=40000, clipInput=True)
s = 0

for index, row in df.iterrows():
    if s > 0 and s % 500 == 0:
        print str(s) + " items processed"
        
    rawValues.append(row['passenger_count'])
    
    if s < numTrainingItems:
        trainSet.append(se.encode(row['passenger_count']))
    else:
        nonTrainSet.append(se.encode(row['passenger_count']))
        
    remainingRows -= 1
    s += 1
    if remainingRows == 0: 
        break
print "*** All items encoded! ***"


500 items processed
1000 items processed
1500 items processed
2000 items processed
2500 items processed
3000 items processed
3500 items processed
4000 items processed
4500 items processed
5000 items processed
5500 items processed
6000 items processed
6500 items processed
7000 items processed
7500 items processed
8000 items processed
8500 items processed
9000 items processed
9500 items processed
10000 items processed
10500 items processed
11000 items processed
11500 items processed
12000 items processed
12500 items processed
13000 items processed
13500 items processed
14000 items processed
14500 items processed
15000 items processed
15500 items processed
16000 items processed
16500 items processed
17000 items processed
17500 items processed
*** All items encoded! ***
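Each encoded record is a 109-bit SDR containing a block of w = 29 active bits whose position follows the scalar value, so nearby passenger counts share many active bits. A quick illustration using the encoder built above (the two values are arbitrary examples):

a = se.encode(10000)
b = se.encode(10500)
print a.sum(), b.sum()  # 29 active bits in each encoding
print np.sum(a * b)     # number of bits the two encodings share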

Part II. Spatial Pooler


In [6]:
allSequences = []
outputColumns = np.zeros(sp.getNumColumns(), dtype="uint32")
columnUsage = np.zeros(sp.getNumColumns(), dtype="uint32")

# Set epochs for spatial-pooling:
spEpochs = 4

for epoch in range(spEpochs):
    print "Training epoch: " + str(epoch)
    
    # Randomize the order of the training records for this epoch
    randomIndex = np.random.permutation(np.arange(numTrainingItems))
    
    for i in range(numTrainingItems):
        sp.compute(trainSet[randomIndex[i]], True, outputColumns)
        # Populate array for Yuwei plot:
        columnUsage[outputColumns.nonzero()[0]] += 1
        if epoch == (spEpochs - 1):
            # Keep the active column indices (tuple holding one index array) from the final epoch
            allSequences.append(outputColumns.nonzero())

for i in range(maxItems - numTrainingItems):
    if i > 0 and i % 500 == 0:
        print str(i) + " items processed"    
    sp.compute(nonTrainSet[i], False, outputColumns)
    allSequences.append(outputColumns.nonzero())
    # Populate array for Yuwei plot:
    columnUsage[outputColumns.nonzero()[0]] += 1

print "*** All items processed! ***"


Training epoch: 0
Training epoch: 1
Training epoch: 2
Training epoch: 3
500 items processed
1000 items processed
1500 items processed
2000 items processed
2500 items processed
*** All items processed! ***

In [7]:
bins = 50
plt.hist(columnUsage, bins)
plt.xlabel("Number of times active")
plt.ylabel("Number of columns")
plt.savefig("columnUsage_SP")
plt.close()

Part III. Temporal Memory


In [8]:
spikeTrains = np.zeros((tm.numberOfCells(), totalTS), dtype = "uint32")
columnUsage = np.zeros(tm.numberOfColumns(), dtype="uint32")
spikeCount = np.zeros(totalTS, dtype="uint32")
ts = 0

entropyX = []
entropyY = []

negPCCX_cells = []
negPCCY_cells = []

numSpikesX = []
numSpikesY = []

numSpikes = 0

negPCCX_cols = []
negPCCY_cols = []

traceX = []
traceY = []

# Indices of the columns to keep track of during the simulation:
# 125 columns x 8 cells/column = 1000 cells (resampled at each analysis point below)
#colIndicesLarge = np.random.permutation(tm.numberOfColumns())[0:125]

for e in range(tmEpochs):
    print ""
    print "Epoch: " + str(e)
    for s in range(maxItems):
        if s % 1000 == 0:
            print str(s) + " items processed"
        
        tm.compute(allSequences[s][0].tolist(), learn=True)
        for cell in tm.getActiveCells():
            spikeTrains[cell, ts] = 1 
            numSpikes += 1
            spikeCount[ts] += 1
            
        # Obtain active columns (each active column is counted once per timestep):
        activeColumnsIndices = set(tm.columnForCell(i) for i in tm.getActiveCells())
        for col in activeColumnsIndices:
            columnUsage[col] += 1
    
        if ts > 0 and ts % int(totalTS * 0.1) == 0:
            numSpikesX.append(ts)
            numSpikesY.append(numSpikes)
            
            numSpikes = 0            
            subSpikeTrains = subSample(spikeTrains, 1000, tm.numberOfCells(), ts, 1000)
            (corrMatrix, numNegPCC) = computePWCorrelations(subSpikeTrains, removeAutoCorr=True)
            negPCCX_cells.append(ts)
            negPCCY_cells.append(numNegPCC)                
            bins = 300
            plt.hist(corrMatrix.ravel(), bins, alpha=0.5)                
            plt.xlim(-0.1,0.2)
            plt.xlabel("PCC")
            plt.ylabel("Frequency")
            plt.savefig("cellsHist" + str(ts))
            plt.close()
            
            traceX.append(ts)
            #traceY.append(sum(1 for i in corrMatrix.ravel() if i > 0.5))
            #traceY.append(np.std(corrMatrix))
            #traceY.append(sum(1 for i in corrMatrix.ravel() if i > -0.05 and i < 0.1))
            traceY.append(sum(1 for i in corrMatrix.ravel() if i > 0.0))
            
            entropyX.append(ts)
            entropyY.append(computeEntropy(subSpikeTrains))

            #print "++ Analyzing correlations (whole columns) ++"
            ### First the LARGE subsample of columns:
            colIndicesLarge = np.random.permutation(tm.numberOfColumns())[0:125] 
            
            subSpikeTrains = subSampleWholeColumn(spikeTrains, colIndicesLarge, tm.getCellsPerColumn(), ts, 1000)
            (corrMatrix, numNegPCC) = computePWCorrelationsWithinCol(subSpikeTrains, True, tm.getCellsPerColumn())
            negPCCX_cols.append(ts)
            negPCCY_cols.append(numNegPCC)                
            #print "++ Generating histogram ++"
            plt.hist(corrMatrix.ravel(), alpha=0.5)
            plt.xlabel("PCC")
            plt.ylabel("Frequency")
            plt.savefig("colsHist_" + str(ts))
            plt.close()                 

        ts += 1                
        
print "*** DONE ***"
# end for-epochs


Epoch: 0
0 items processed
1000 items processed
2000 items processed
3000 items processed
4000 items processed
5000 items processed
6000 items processed
7000 items processed
8000 items processed
9000 items processed
10000 items processed
11000 items processed
12000 items processed
13000 items processed
14000 items processed
15000 items processed
16000 items processed
17000 items processed
*** DONE ***

In [9]:
plt.plot(traceX, traceY)
plt.xlabel("Time")
plt.ylabel("Positive PCC Count")
plt.savefig("positivePCCTrace")
plt.close()

In [ ]:
# Sparsity of TM activity, averaged over a sliding window of 1000 timesteps
sparsityTraceX = []
sparsityTraceY = []
for i in range(totalTS - 1000):
    sparsityTraceX.append(i)
    sparsityTraceY.append(np.mean(spikeCount[i:1000 + i]) / tm.numberOfCells())

plt.plot(sparsityTraceX, sparsityTraceY)
plt.xlabel("Time")
plt.ylabel("Sparsity")
plt.savefig("sparsityTrace")
plt.close()

In [ ]:
# plot trace of negative PCCs
plt.plot(negPCCX_cells, negPCCY_cells)
plt.xlabel("Time")
plt.ylabel("Negative PCC Count")
plt.savefig("negPCCTrace_cells")
plt.close()

plt.plot(negPCCX_cols, negPCCY_cols)
plt.xlabel("Time")
plt.ylabel("Negative PCC Count")
plt.savefig("negPCCTrace_cols")
plt.close()

In [ ]:
# Plot the entropy of TM activity over time
plt.plot(entropyX, entropyY)
plt.xlabel("Time")
plt.ylabel("Entropy")
plt.savefig("entropyTM")
plt.close()
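computeEntropy comes from neural_correlations_utils and its exact definition is not shown here. As a rough stand-in for intuition only, one common way to summarize how distributed the activity is in a binary spike-train matrix is the Shannon entropy of each cell's firing rate, averaged over cells. A minimal sketch of that idea (not necessarily the utility's definition):

def meanBinaryEntropy(spikeMatrix):
    # spikeMatrix: cells x timesteps binary array
    p = spikeMatrix.mean(axis=1)  # per-cell firing probability over the window
    eps = 1e-12                   # avoid log(0); H(0) = H(1) = 0
    h = -(p * np.log2(p + eps) + (1.0 - p) * np.log2(1.0 - p + eps))
    return h.mean()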

In [ ]:
bins = 50
plt.hist(columnUsage, bins)
plt.xlabel("Number of times active")
plt.ylabel("Number of columns")
plt.savefig("columnUsage_TM")
plt.close()

In [ ]:
plt.plot(numSpikesX, numSpikesY)
plt.xlabel("Time")
plt.ylabel("Num Spikes")
plt.savefig("numSpikesTrace")
plt.close()

Part IV. Analysis of Spike Trains


In [ ]:
simpleAccuracyTest("periodic", tm, allSequences)

In [ ]:
subSpikeTrains = subSample(spikeTrains, 1000, tm.numberOfCells(), 0, 0)

In [ ]:
isi = computeISI(subSpikeTrains)
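computeISI is provided by neural_correlations_utils. The underlying quantity is the inter-spike interval: the gap, in timesteps, between consecutive spikes of a cell. A minimal sketch of that computation for a single cell's binary spike train (pooling the per-cell intervals is one plausible way to build the histogram below, not necessarily the utility's exact procedure):

def cellISI(spikeTrain):
    # spikeTrain: 1-D binary array over timesteps
    spikeTimes = np.nonzero(spikeTrain)[0]
    return np.diff(spikeTimes)  # intervals between consecutive spikes

print cellISI(np.array([0, 1, 0, 0, 1, 1, 0, 1]))  # -> [3 1 2]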

In [ ]:
#bins = np.linspace(np.min(isi), np.max(isi), 50)
bins = 100
plt.hist(isi, bins)
# plt.xlim(0,4000)
# plt.xlim(89500,92000)
plt.xlabel("ISI")
plt.ylabel("Frequency")
plt.savefig("isiTM")
plt.close()

In [ ]:
print np.mean(isi)
print np.std(isi)
print np.std(isi)/np.mean(isi)

Raster plots


In [ ]:
subSpikeTrains = subSample(spikeTrains, 100, tm.numberOfCells(), -1, 1000)

In [ ]:
rasterPlot(subSpikeTrains, "TM")
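rasterPlot is also provided by neural_correlations_utils. If you just want a quick raster of the subsampled spike trains without that helper, matplotlib's eventplot gives the standard cells-vs-time view (a sketch, assuming subSpikeTrains is a cells x timesteps binary array):

spikeTimes = [np.nonzero(subSpikeTrains[c])[0] for c in range(subSpikeTrains.shape[0])]
plt.eventplot(spikeTimes, colors='k', linewidths=0.5)
plt.xlabel("Time step")
plt.ylabel("Cell")
plt.savefig("rasterTM_eventplot")
plt.close()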

Part V. Save TM


In [ ]:
saveTM(tm)

In [ ]:
# To load the TM back from the file:
# (TemporalMemoryProto_capnp is assumed to be exposed by the wildcard import
#  from htmresearch.support.neural_correlations_utils above)
with open('tm.nta', 'rb') as f:
    proto2 = TemporalMemoryProto_capnp.TemporalMemoryProto.read(f, traversal_limit_in_words=2**61)
tm = TM.read(proto2)

Part VI. Analysis of Input


In [ ]:
overlapMatrix = inputAnalysis(allSequences, "periodic", tm.numberOfColumns())
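inputAnalysis comes from neural_correlations_utils; the resulting overlapMatrix scores how similar the SP outputs are across timesteps. As a sketch of the basic quantity (one plausible normalization, not necessarily the utility's), the overlap between two timesteps is the number of active columns they share:

colsA = set(allSequences[0][0])  # active columns at timestep 0
colsB = set(allSequences[1][0])  # active columns at timestep 1
print len(colsA & colsB) / float(sparseCols)  # fraction of shared active columns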

In [ ]:
# show heatmap of overlap matrix
plt.imshow(overlapMatrix, cmap='spectral', interpolation='nearest')
cb = plt.colorbar()
cb.set_label('Overlap Score')
plt.savefig("overlapScore_heatmap")
plt.close()
# plt.show()

# generate histogram
bins = 60
(n, bins, patches) = plt.hist(overlapMatrix.ravel(), bins, alpha=0.5)

plt.xlabel("Overlap Score")
plt.ylabel("Frequency")
plt.savefig("overlapScore_hist")

# Zoom into the same histogram and save a second copy
plt.xlim(0.2,1)
plt.ylim(0,1000000)
plt.xlabel("Overlap Score")
plt.ylabel("Frequency")
plt.savefig("overlapScore_hist_ZOOM")
plt.close()
