In [3]:
import numpy as np
#from counting_grid import CountingGrid
In [41]:
class CountingGrid():
    '''
    Counting Grid model: every grid location holds a distribution over
    features (pi); histograms (h) are averages of pi over a sliding
    window, and samples are mapped to window locations with
    variational Expectation Maximization.
    '''

    def __init__(self, size, window_size, nr_of_features):
        '''
        size -- two-element numpy array, extent of the Counting Grid
                in the x and y directions
        window_size -- two-element numpy array, extent of the
                averaging window
        nr_of_features -- number of features Z
        '''
        np.random.seed(7)  # fixed seed so initialization is reproducible
        self.size = size
        self.window_size = window_size
        self.nr_of_features = nr_of_features  # No. of features
        # pi is a 3D array, shape (Z, size[0], size[1]): the first axis
        # spans features, the second and third span the grid; each grid
        # location carries a distribution over features (axis 0 sums to 1).
        rand_init_pi = 1 + np.random.rand(self.nr_of_features, self.size[0], self.size[1])
        self.pi = rand_init_pi / np.sum(rand_init_pi, axis=0)
        # h holds the window-averaged histograms, same shape as pi.
        self.h = np.zeros((self.nr_of_features, self.size[0], self.size[1]))
        # FIX: populate h now -- the original left this call commented
        # out, so e_step() evaluated log(h) on all zeros.
        self.compute_histograms()

    def normalize_data(self, X):
        '''
        Scale each sample (row of X) so that its features sum to
        prod(window_size). X has shape (nr_of_samples, nr_of_features).
        '''
        row_sums = np.sum(X, axis=1, keepdims=True)
        return np.prod(self.window_size) * X.astype('float') / row_sums

    def compute_sum_in_a_window(self, grid, k, l):
        '''
        Sum grid values over the window [k:k+w0, l:l+w1] for every
        leading (feature or sample) slice, using inclusion-exclusion
        on prefix rectangles.
        NOTE(review): the window is truncated at the grid edge rather
        than wrapped toroidally -- confirm this is intended.
        '''
        w0, w1 = self.window_size[0], self.window_size[1]
        cumsum1 = np.sum(np.sum(grid[:, :k + w0, :l + w1], axis=1), axis=1)
        cumsum2 = np.sum(np.sum(grid[:, :k, :l + w1], axis=1), axis=1)
        cumsum3 = np.sum(np.sum(grid[:, :k + w0, :l], axis=1), axis=1)
        cumsum4 = np.sum(np.sum(grid[:, :k, :l], axis=1), axis=1)
        return cumsum1 - cumsum2 - cumsum3 + cumsum4

    def compute_histograms(self):
        '''
        Histograms at each point in the grid are computed by averaging
        the distributions pi in a pre-defined window. The upper-left
        corner of the window is placed on the grid position and the
        distributions are averaged.
        '''
        for k in range(self.size[0]):
            for l in range(self.size[1]):
                self.h[:, k, l] = self.compute_sum_in_a_window(self.pi, k, l)
        self.h = self.h / np.prod(self.window_size)

    def e_step(self, X):
        '''
        Compute q, shape (nr_of_samples, size[0], size[1]): the
        posterior probability of each sample mapping to the window at
        each grid location, proportional to exp(sum_z X[t,z]*log h[z]).
        h, shape (nr_of_features, size[0], size[1]), holds the
        histograms the samples are assumed drawn from.
        '''
        # Floor to avoid zero posteriors, for numerical purposes.
        min_numeric_probability = 1.0 / (10 * self.size[0] * self.size[1])
        # Contract the feature axis of X against the feature axis of log(h).
        self.q = np.exp(np.tensordot(X, np.log(self.h), axes=(1, 0)))
        self.q[self.q < min_numeric_probability] = min_numeric_probability
        # Normalize each sample's map into a probability distribution.
        self.q = self.q / np.sum(self.q, axis=(1, 2), keepdims=True)

    def update_pi(self, X):
        '''
        M-step update of the grid distributions pi from the data X,
        the mapping posteriors q and the current histograms h, followed
        by renormalization over the feature axis.
        '''
        new_pi = np.zeros([self.nr_of_features, self.size[0], self.size[1]])
        for z in range(X.shape[1]):
            # ratio: shape (nr_of_samples, size[0], size[1])
            ratio = self.q / self.h[z, :, :]
            for i1 in range(self.size[0]):
                for i2 in range(self.size[1]):
                    # Per-sample window sums, weighted by the data counts.
                    window_sum = self.compute_sum_in_a_window(ratio, i1, i2)
                    new_pi[z, i1, i2] = self.pi[z, i1, i2] * np.dot(X[:, z], window_sum)
        self.pi = new_pi / np.sum(new_pi, axis=0)

    def m_step(self, X):
        '''One M-step: update pi, then refresh the histograms h.'''
        self.update_pi(X)
        # FIX: the original called self.update_h(), which does not exist
        # (AttributeError); compute_histograms() is the intended refresh.
        self.compute_histograms()

    def fit(self, X, max_iteration, y=None):
        '''
        Fit the counting grid with variational Expectation Maximization.
        X -- array of shape (nr_of_samples, nr_of_features); y is unused.
        Returns (pi, q).
        '''
        X = self.normalize_data(X)
        for i in range(max_iteration):
            self.e_step(X)
            self.m_step(X)
        return self.pi, self.q

    def cg_plot(self, labels):
        '''
        Scatter every sample at its most probable grid location, one
        marker per label. Currently supports 5 different symbols: the
        labels have to be numbers between 0-4 for the code to work.
        '''
        # FIX: the original referenced plt/cm without importing them.
        import matplotlib.pyplot as plt
        from matplotlib import cm
        markers = ['o', 'v', '^', '*', '+']
        lab = np.unique(labels)
        for i in range(len(lab)):
            marker = markers[i]
            ids = np.where(labels == lab[i])[0]
            for t in range(len(ids)):
                temp = self.q[ids[t], :, :]
                x, y = np.unravel_index(temp.argmax(), temp.shape)
                # Small jitter so coincident points remain visible.
                noise = 0.2 * np.random.rand(1)
                plt.scatter(x + noise, y + noise, marker=marker, s=60, color=cm.rainbow(i * 100))
        plt.show()
class CountingGridTest():
    '''Placeholder test harness for CountingGrid; all cases are TODO stubs.'''

    def test_h_computation(self):
        # TODO: compare CountingGrid.compute_histograms against a
        # hand-computed reference. (FIX: the original evaluated
        # self.pi, an attribute never set on this class, which would
        # raise AttributeError the moment the stub was called.)
        return 0

    def test_e_step(self):
        # TODO: verify that q sums to 1 per sample after e_step.
        return 0

    def test_m_step(self):
        # TODO: verify pi stays normalized over features after an update.
        return 0

    def test_arr_sum(self):
        # TODO: verify compute_sum_in_a_window against a brute-force sum.
        return 0
In [ ]:
from scipy import io
def filter_by_variance(X, nr_of_features_to_keep):
    '''
    Keep only the `nr_of_features_to_keep` columns of X with the
    highest variance, ordered from most to least variable.

    X -- array of shape (nr_of_samples, nr_of_features)
    Returns the thresholded array of shape
    (nr_of_samples, nr_of_features_to_keep).
    '''
    variances = np.var(X, axis=0)
    most_variable_first = np.argsort(variances)[::-1]
    selected = most_variable_first[:nr_of_features_to_keep]
    return X[:, selected]
# --- Notebook cell: fit a Counting Grid to the lung_bhattacherjee dataset ---
# NOTE(review): hardcoded absolute path; adjust for your environment.
data = io.loadmat('/home/maria/Documents/CountingGrids/lung_bhattacherjee.mat')
X= data['data']
Y_labels = data['sample_names'][0]
# Keep only the 10 highest-variance features.
X = filter_by_variance(X,10)
print(X)
# exp-transform: presumably the .mat file stores log-scale values -- TODO confirm
X=np.exp(X)
# 15x15 grid, 3x3 window, 10 features; run 10 EM iterations.
cg_obj=CountingGrid(np.array([15,15]),np.array([3,3]),10)
cg_obj.fit(X,10)
In [34]:
# --- Notebook cell: smoke-test one EM pass on a tiny 5x5 grid, 3 features ---
cg_obj=CountingGrid(np.array([5,5]),np.array([2,2]),3)
cg_obj.compute_histograms()
# Two samples with three features each (one row per sample).
X=np.array([[1,2,3],[3,4,5]])
cg_obj.e_step(X)
cg_obj.update_pi(X)
print(cg_obj.pi)
In [9]:
# --- Notebook cell: manual check of the windowed-sum helper ---
cg_obj=CountingGrid(np.array([5,5]),np.array([2,2]),3)
#X has dimensions different from the counts in
#MATLAB implementation-- one row is one data vector,
#that's how it is in sci-kit learn
X=np.array([[1,2,3],[3,4,5]])
# Sum of pi inside the 2x2 window anchored at (4,2); note the window
# is truncated at the grid edge here, so only row 4 contributes.
print(cg_obj.compute_sum_in_a_window(cg_obj.pi,4,2))
print(cg_obj.pi)
#np.sum(cg_obj.pi[0,:2,:2])
# Brute-force reference sum for feature 2, window anchored at (1,1).
print(np.sum(cg_obj.pi[2,1:3,1:3]))