In this experiment we use the NYC taxi dataset. We pass it through a scalar encoder and a spatial pooler, and then feed the resulting SDRs to the temporal memory (TM). We do a single pass through the TM, keeping track of the spike train of each cell, and use these spike trains to estimate pairwise correlations among cells.
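The pairwise-correlation helpers used below (computePWCorrelations and friends) come from htmresearch.support.neural_correlations_utils. As a rough illustration of the quantity being estimated, a Pearson correlation coefficient (PCC) matrix over binary spike trains can be computed directly with NumPy; the sketch below is only an illustration of that idea, not the helper's actual implementation, and the function name is made up.
import numpy as np

def pairwiseCorrelationsSketch(spikeTrains):
    # spikeTrains: (numCells x numTimesteps) binary matrix, one row per cell.
    # Cells that never fire have zero variance, so their PCCs come out as NaN.
    corrMatrix = np.corrcoef(spikeTrains)   # PCC between every pair of rows
    np.fill_diagonal(corrMatrix, 0.0)       # discard autocorrelations
    numNegPCC = np.sum(corrMatrix < 0)      # count negative pairwise correlations
    return corrMatrix, numNegPCC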
In [1]:
import numpy as np
import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pandas as pd
from nupic.encoders import ScalarEncoder
from nupic.bindings.algorithms import TemporalMemory as TM
from nupic.bindings.algorithms import SpatialPooler as SP
from htmresearch.support.neural_correlations_utils import *
random.seed(1)
In [2]:
inputSize = 109
maxItems = 17520
tmEpochs = 1
totalTS = maxItems * tmEpochs
In [3]:
# Read the CSV file (skip the two metadata rows after the header)
df = pd.read_csv('nyc_taxi.csv', skiprows=[1, 2])
In [4]:
tm = TM(columnDimensions=(2048,),
        cellsPerColumn=8,  # reduced from the default of 32 cells per column
        initialPermanence=0.21,
        connectedPermanence=0.3,
        minThreshold=15,
        maxNewSynapseCount=40,
        permanenceIncrement=0.1,
        permanenceDecrement=0.1,
        activationThreshold=15,
        predictedSegmentDecrement=0.01
        )
# Target ~2% of columns active per timestep
sparsity = 0.02
sparseCols = int(tm.numberOfColumns() * sparsity)
sp = SP(inputDimensions=(inputSize,),
        columnDimensions=(2048,),
        potentialRadius=int(0.5 * inputSize),
        numActiveColumnsPerInhArea=sparseCols,
        potentialPct=0.9,
        globalInhibition=True,
        synPermActiveInc=0.0001,
        synPermInactiveDec=0.0005,
        synPermConnected=0.5,
        boostStrength=0.0,
        spVerbosity=1
        )
In [5]:
rawValues = []
remainingRows = maxItems
numTrainingItems = 15000
trainSet = []
nonTrainSet = []
se = ScalarEncoder(n=109, w=29, minval=0, maxval=40000, clipInput=True)
s = 0
for index, row in df.iterrows():
    if s > 0 and s % 500 == 0:
        print str(s) + " items processed"
    rawValues.append(row['passenger_count'])
    if s < numTrainingItems:
        trainSet.append(se.encode(row['passenger_count']))
    else:
        nonTrainSet.append(se.encode(row['passenger_count']))
    remainingRows -= 1
    s += 1
    if remainingRows == 0:
        break
print "*** All items encoded! ***"
In [6]:
allSequences = []
outputColumns = np.zeros(sp.getNumColumns(), dtype="uint32")
columnUsage = np.zeros(sp.getNumColumns(), dtype="uint32")
# Set epochs for spatial-pooling:
spEpochs = 4
for epoch in range(spEpochs):
    print "Training epoch: " + str(epoch)
    # Randomize the order of the records in the training set
    randomIndex = np.random.permutation(np.arange(numTrainingItems))
    for i in range(numTrainingItems):
        sp.compute(trainSet[randomIndex[i]], True, outputColumns)
        # Populate array for Yuwei plot:
        for col in outputColumns.nonzero():
            columnUsage[col] += 1
        if epoch == (spEpochs - 1):
            allSequences.append(outputColumns.nonzero())

for i in range(maxItems - numTrainingItems):
    if i > 0 and i % 500 == 0:
        print str(i) + " items processed"
    sp.compute(nonTrainSet[i], False, outputColumns)
    allSequences.append(outputColumns.nonzero())
    # Populate array for Yuwei plot:
    for col in outputColumns.nonzero():
        columnUsage[col] += 1

print "*** All items processed! ***"
In [7]:
bins = 50
plt.hist(columnUsage, bins)
plt.xlabel("Number of times active")
plt.ylabel("Number of columns")
plt.savefig("columnUsage_SP")
plt.close()
In [8]:
spikeTrains = np.zeros((tm.numberOfCells(), totalTS), dtype="uint32")
columnUsage = np.zeros(tm.numberOfColumns(), dtype="uint32")
spikeCount = np.zeros(totalTS, dtype="uint32")
ts = 0

entropyX = []
entropyY = []

negPCCX_cells = []
negPCCY_cells = []

numSpikesX = []
numSpikesY = []
numSpikes = 0

negPCCX_cols = []
negPCCY_cols = []

traceX = []
traceY = []

# Randomly generate the indices of the columns to keep track of during the simulation
# (125 columns x 8 cells per column = 1000 cells)
#colIndicesLarge = np.random.permutation(tm.numberOfColumns())[0:125]

for e in range(tmEpochs):
    print ""
    print "Epoch: " + str(e)
    for s in range(maxItems):
        if s % 1000 == 0:
            print str(s) + " items processed"

        tm.compute(allSequences[s][0].tolist(), learn=True)
        for cell in tm.getActiveCells():
            spikeTrains[cell, ts] = 1
            numSpikes += 1
            spikeCount[ts] += 1

        # Obtain active columns:
        activeColumnsIndices = [tm.columnForCell(i) for i in tm.getActiveCells()]
        currentColumns = [1 if i in activeColumnsIndices else 0 for i in range(tm.numberOfColumns())]
        for col in np.nonzero(currentColumns)[0]:
            columnUsage[col] += 1

        # Every 10% of the run, estimate pairwise correlations from a subsample of the spike trains
        if ts > 0 and ts % int(totalTS * 0.1) == 0:
            numSpikesX.append(ts)
            numSpikesY.append(numSpikes)
            numSpikes = 0

            # Correlations among a random subsample of cells:
            subSpikeTrains = subSample(spikeTrains, 1000, tm.numberOfCells(), ts, 1000)
            (corrMatrix, numNegPCC) = computePWCorrelations(subSpikeTrains, removeAutoCorr=True)
            negPCCX_cells.append(ts)
            negPCCY_cells.append(numNegPCC)
            bins = 300
            plt.hist(corrMatrix.ravel(), bins, alpha=0.5)
            plt.xlim(-0.1, 0.2)
            plt.xlabel("PCC")
            plt.ylabel("Frequency")
            plt.savefig("cellsHist" + str(ts))
            plt.close()

            traceX.append(ts)
            #traceY.append(sum(1 for i in corrMatrix.ravel() if i > 0.5))
            #traceY.append(np.std(corrMatrix))
            #traceY.append(sum(1 for i in corrMatrix.ravel() if i > -0.05 and i < 0.1))
            traceY.append(sum(1 for i in corrMatrix.ravel() if i > 0.0))

            entropyX.append(ts)
            entropyY.append(computeEntropy(subSpikeTrains))

            # Correlations within whole columns, over a large random subsample of 125 columns:
            colIndicesLarge = np.random.permutation(tm.numberOfColumns())[0:125]
            subSpikeTrains = subSampleWholeColumn(spikeTrains, colIndicesLarge, tm.getCellsPerColumn(), ts, 1000)
            (corrMatrix, numNegPCC) = computePWCorrelationsWithinCol(subSpikeTrains, True, tm.getCellsPerColumn())
            negPCCX_cols.append(ts)
            negPCCY_cols.append(numNegPCC)

            # Generate histogram:
            plt.hist(corrMatrix.ravel(), alpha=0.5)
            plt.xlabel("PCC")
            plt.ylabel("Frequency")
            plt.savefig("colsHist_" + str(ts))
            plt.close()

        ts += 1

    print "*** DONE ***"
# end for-epochs
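computeEntropy is another helper from neural_correlations_utils and its definition is not shown here. A common choice, and only an assumption about what the helper computes, is the summed binary entropy of each cell's firing rate over the sampled window:
def binaryEntropySketch(subSpikeTrains):
    # Assumed definition: sum over cells of H(p) = -p*log2(p) - (1-p)*log2(1-p),
    # where p is a cell's firing rate in the window; silent or always-on cells contribute 0.
    p = subSpikeTrains.mean(axis=1)
    p = p[(p > 0) & (p < 1)]  # avoid log(0)
    return np.sum(-p * np.log2(p) - (1.0 - p) * np.log2(1.0 - p))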
In [9]:
plt.plot(traceX, traceY)
plt.xlabel("Time")
plt.ylabel("Positive PCC Count")
plt.savefig("positivePCCTrace")
plt.close()
In [ ]:
sparsityTraceX = []
sparsityTraceY = []
for i in range(totalTS - 1000):
    sparsityTraceX.append(i)
    sparsityTraceY.append(np.mean(spikeCount[i:1000 + i]) / tm.numberOfCells())

plt.plot(sparsityTraceX, sparsityTraceY)
plt.xlabel("Time")
plt.ylabel("Sparsity")
plt.savefig("sparsityTrace")
plt.close()
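As a rough sanity check on this trace: with 2048 columns x 8 cells per column = 16,384 cells and about 40 active columns per step, sparsity should sit between roughly 40 / 16384 ≈ 0.24% (every active column has a single predicted cell) and 320 / 16384 ≈ 1.95% (every active column bursts).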
In [ ]:
# plot trace of negative PCCs
plt.plot(negPCCX_cells, negPCCY_cells)
plt.xlabel("Time")
plt.ylabel("Negative PCC Count")
plt.savefig("negPCCTrace_cells")
plt.close()
plt.plot(negPCCX_cols, negPCCY_cols)
plt.xlabel("Time")
plt.ylabel("Negative PCC Count")
plt.savefig("negPCCTrace_cols")
plt.close()
In [ ]:
# Plot the entropy trace computed during the TM run
plt.plot(entropyX, entropyY)
plt.xlabel("Time")
plt.ylabel("Entropy")
plt.savefig("entropyTM")
plt.close()
In [ ]:
bins = 50
plt.hist(columnUsage, bins)
plt.xlabel("Number of times active")
plt.ylabel("Number of columns")
plt.savefig("columnUsage_TM")
plt.close()
In [ ]:
plt.plot(numSpikesX, numSpikesY)
plt.xlabel("Time")
plt.ylabel("Num Spikes")
plt.savefig("numSpikesTrace")
plt.close()
In [ ]:
simpleAccuracyTest("periodic", tm, allSequences)
In [ ]:
subSpikeTrains = subSample(spikeTrains, 1000, tm.numberOfCells(), 0, 0)
In [ ]:
isi = computeISI(subSpikeTrains)
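computeISI is a library helper as well; the sketch below shows the standard way to pool inter-spike intervals (gaps between consecutive spikes of each cell), which is assumed, not confirmed, to be what the helper returns.
def interSpikeIntervalsSketch(subSpikeTrains):
    # Pool the gaps between consecutive spike times of every cell in the subsample.
    intervals = []
    for cellTrain in subSpikeTrains:
        spikeTimes = np.nonzero(cellTrain)[0]
        if len(spikeTimes) > 1:
            intervals.extend(np.diff(spikeTimes))
    return np.array(intervals)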
In [ ]:
#bins = np.linspace(np.min(isi), np.max(isi), 50)
bins = 100
plt.hist(isi, bins)
# plt.xlim(0,4000)
# plt.xlim(89500,92000)
plt.xlabel("ISI")
plt.ylabel("Frequency")
plt.savefig("isiTM")
plt.close()
In [ ]:
print np.mean(isi)
print np.std(isi)
print np.std(isi)/np.mean(isi)  # coefficient of variation (CV) of the ISI distribution
In [ ]:
subSpikeTrains = subSample(spikeTrains, 100, tm.numberOfCells(), -1, 1000)
In [ ]:
rasterPlot(subSpikeTrains, "TM")
In [ ]:
saveTM(tm)
In [ ]:
# to load the TM back from the file do:
with open('tm.nta', 'rb') as f:
    proto2 = TemporalMemoryProto_capnp.TemporalMemoryProto.read(f, traversal_limit_in_words=2**61)
tm = TM.read(proto2)
In [ ]:
overlapMatrix = inputAnalysis(allSequences, "periodic", tm.numberOfColumns())
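inputAnalysis builds a matrix of overlap scores between the SP outputs for every pair of inputs. The plots below assume scores in [0, 1]; one plausible definition, shown only as an illustration and not taken from the helper, is the intersection size normalized by the larger of the two active-column sets:
def overlapScoreSketch(activeColsA, activeColsB):
    # Normalized overlap between two sets of active column indices, in [0, 1].
    a, b = set(activeColsA), set(activeColsB)
    if not a or not b:
        return 0.0
    return len(a & b) / float(max(len(a), len(b)))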
In [ ]:
# show heatmap of overlap matrix
plt.imshow(overlapMatrix, cmap='spectral', interpolation='nearest')
cb = plt.colorbar()
cb.set_label('Overlap Score')
plt.savefig("overlapScore_heatmap")
plt.close()
# plt.show()
# generate histogram
bins = 60
(n, bins, patches) = plt.hist(overlapMatrix.ravel(), bins, alpha=0.5)
plt.xlabel("Overlap Score")
plt.ylabel("Frequency")
plt.savefig("overlapScore_hist")
plt.xlim(0.2,1)
plt.ylim(0,1000000)
plt.xlabel("Overlap Score")
plt.ylabel("Frequency")
plt.savefig("overlapScore_hist_ZOOM")
plt.close()
In [ ]: