Checking Assumptions


In [1]:
import itertools
import csv
import numpy as np
from scipy import linalg
from scipy.stats import cumfreq
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture
from sklearn import cluster
from rpy2 import robjects as ro
from rpy2.robjects.packages import importr
mvn = importr('MVN')
%matplotlib inline
np.random.seed(1)


/Library/Python/2.7/site-packages/rpy2/robjects/packages.py:63: UserWarning: Note: the specification for S3 class “family” in package ‘MatrixModels’ seems equivalent to one from package ‘lme4’: not turning on duplicate class definitions for this class.

  return _reval(expr)

Load data


In [2]:
X = np.load('SynapseFeatures.npy')
print 'data loaded'


data loaded

Cluster data into 17 clusters using k-means


In [3]:
kmeans = cluster.MiniBatchKMeans(n_clusters=17)
ClusterIdx = kmeans.fit_predict(X)
print 'k-means complete'


k-means complete

In [4]:
from rpy2.robjects.packages import importr
mvn = importr('MVN')

In [ ]:
ro.r.assign('ClusterIdx',ClusterIdx.tolist())
ro.r('ClusterIdx <- unlist(ClusterIdx)')
ro.r.assign('Xr',ro.FloatVector(np.ravel(X)))
nr = X.shape[0]
ro.r.assign('nr',nr)
ro.r('Xr <- matrix(Xr, nrow = nr, byrow = TRUE)')
ro.r('''
     for (i in seq(1,max(ClusterIdx)))
     {pdf(paste("qq_plot_",i,".pdf"))
     qqnorm(Xr[ClusterIdx==i,1:ncol(Xr)], main = "Normal Q-Q Plot",
         xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
         plot.it = TRUE)
     qqline(Xr[ClusterIdx==i,1:ncol(Xr)])
     dev.off()
     hzTestResult <- hzTest(Xr, qqplot = FALSE)
     print(hzTestResult)
     }
     ''')
#ro.r('pdf("qq_plot.pdf")')
#ro.r('''qqnorm(Xr, main = "Normal Q-Q Plot",
#   xlab = "Theoretical Quantiles", ylab = "Sample Quantiles",
#   plot.it = TRUE)''')
#ro.r('''qqline(Xr)''')
#ro.r('dev.off()')

In [8]:
ro.r('hzTestResult <- hzTest(Xr, qqplot = FALSE)')
hzTestStat = ro.r('hzTestResult')

In [11]:
ro.r('print(hzTestResult)')


  Henze-Zirkler's Multivariate Normality Test 
--------------------------------------------- 
  data : Xr 

  HZ      : 1 
  p-value : 0 

  Result  : Data are not multivariate normal. 
--------------------------------------------- 

Out[11]:
<RS4 - Python:0x10d022638 / R:0x7fc67ef02db8>

In [19]:
ro.r('print(length(ClusterIdx))')
ro.r('print(nrow(Xr))')


[1] 100000
[1] 8897
Out[19]:
<IntVector - Python:0x10d020ab8 / R:0x7fc67da26aa8>
[    8897]