New Analysis

Goals:

  • histogram of synapse density
  • distribution along all 3 planes (x, y, z)
  • break the volume into quarters and test 2 of them (Bijan doing halves?)
  • visualization of density in the x/y plane for each of the 12 slices (sketched just below this list)
  • clustering along each of the planes
  • run all our tests on the "cleaned" data and see how the results differ
  • if we find layers, or define them, run regressions/tests within them
  • spatial point process
  • plot the cluster centers and then draw the ellipsoids formed by the diagonals of the covariance matrix around each centroid
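
One goal above (density in the x/y plane for each slice) is concrete enough to sketch now. A minimal sketch, assuming slices correspond to the distinct z values and using the data_density array (columns x, y, z, syn/unmasked density) that gets built in the histogram cell further down:

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

# Sketch: one x/y density map per z-slice of a data_density array
# whose columns are x, y, z, syn/unmasked density.
def plot_density_slices(data_density):
    z_values = np.unique(data_density[:, 2])  # one slice per distinct z value
    for z in z_values:
        slice_rows = data_density[data_density[:, 2] == z]
        fig = plt.figure(figsize=(6, 3))
        ax = fig.gca()
        sc = ax.scatter(slice_rows[:, 0], slice_rows[:, 1],
                        c=slice_rows[:, 3], s=8, edgecolors='none')
        cbar = fig.colorbar(sc)
        cbar.set_label('syn/unmasked density')
        ax.set_title('density in x/y plane, z = %g' % z)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        plt.show()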

Setting up the data and removing rows whose unmasked value is 0

Bounds check shared by all analyses; indices taken from our old code.


In [1]:
import matplotlib.pyplot as plt
%matplotlib inline 
import numpy as np
import urllib2
import scipy.stats as stats

url = ('https://raw.githubusercontent.com/Upward-Spiral-Science/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # remove label (header) row

# Clip data based on thresholds on the x and y coordinates, found from Bijan's visualization
x_bounds = (409, 3529)
y_bounds = (1564, 3000)

def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]: # check x in bounds
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]: # check y in bounds
        return False
    if row[3] == 0:                                  # drop rows with an unmasked value of 0
        return False
    return True

indices_in_bound = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_clipped = csv[indices_in_bound]
n = data_clipped.shape[0]
print 'xbound:', x_bounds, ' ybound: ', y_bounds, "n shape after clipping data: ", n


xbound: (409, 3529)  ybound:  (1564, 3000) n shape after clipping data:  32967

Histogram of syn/unmasked (density)

Combining the data, using a snippet from Bijan.


In [2]:
density = np.divide(data_clipped[:, 4], data_clipped[:, 3])*(64**3) # syn/unmasked, scaled by 64^3

data_density = np.vstack((data_clipped[:,0],data_clipped[:,1],data_clipped[:,2],density))
data_density = data_density.T

bins = 100

fig = plt.figure(1)
ax = fig.gca()
plt.hist(data_density[:,3],bins)
ax.set_title('Histogram of syn/unmasked density')
ax.set_xlabel('density (syn/unmasked * 64^3)')
ax.set_ylabel('count')

plt.show()
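
One of the goals above is to break the volume into quarters and test 2 of them. A minimal sketch of that idea, assuming quarters are defined by splitting the clipped x and y ranges at their midpoints and comparing density distributions with a two-sample KS test (x_bounds, y_bounds, data_density, and scipy.stats all come from the cells above):

In [ ]:
# Sketch: split the volume into four x/y quarters and compare the density
# distributions of two opposite quarters with a Kolmogorov-Smirnov test.
x_mid = (x_bounds[0] + x_bounds[1]) / 2.0
y_mid = (y_bounds[0] + y_bounds[1]) / 2.0

quarter_densities = []
for x_low in (True, False):
    for y_low in (True, False):
        in_x = data_density[:, 0] < x_mid if x_low else data_density[:, 0] >= x_mid
        in_y = data_density[:, 1] < y_mid if y_low else data_density[:, 1] >= y_mid
        quarter_densities.append(data_density[np.logical_and(in_x, in_y), 3])

ks_stat, p_value = stats.ks_2samp(quarter_densities[0], quarter_densities[3])
print 'KS test between two opposite quarters: D = %.4f, p = %.4g' % (ks_stat, p_value)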


Checking for the optimal number of clusters


In [3]:
import sklearn.mixture as mixture
max_clusters = 15
bic = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    print 'testing for cluster: ' + str(idx) 
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(data_density)
    bic = np.append(bic, gmm.bic(data_density))
    
fig = plt.figure(2)
ay = fig.gca()
plt.plot(i, 1.0/bic)
ay.set_title('BIC vs. number of clusters')
ay.set_ylabel('1 / BIC')
ay.set_xlabel('# of clusters')


plt.show()
print 'done'


testing for cluster: 1
testing for cluster: 2
testing for cluster: 3
testing for cluster: 4
testing for cluster: 5
testing for cluster: 6
testing for cluster: 7
testing for cluster: 8
testing for cluster: 9
testing for cluster: 10
testing for cluster: 11
testing for cluster: 12
testing for cluster: 13
testing for cluster: 14
done
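
As a quick numeric check (a small sketch reusing the bic and i arrays from the cell above), the candidate number of clusters can be read off as the one minimizing BIC:

In [ ]:
# Pick the number of clusters with the lowest BIC from the sweep above.
best_k = i[np.argmin(bic)]
print 'number of clusters minimizing BIC:', best_k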

Plotting a Gaussian fit over the density histogram


In [4]:
print 'going for Gaussian'

import matplotlib.mlab as mlab

fig = plt.figure(3)
az = fig.gca()
result = plt.hist(data_density[:,3],bins)
az.set_title('Histogram of data_density')
az.set_xlabel('num')

plt.xlim((min(data_density[:,3]), max(data_density[:,3])))

mean = np.mean(data_density[:,3])
variance = np.var(data_density[:,3])
sigma = np.sqrt(variance)
x = np.linspace(min(data_density[:,3]), max(data_density[:,3]), 100)
dx = result[1][1] - result[1][0]    # histogram bin width
scale = len(data_density[:,3])*dx   # scale the pdf up to histogram counts
plt.plot(x, mlab.normpdf(x, mean, sigma)*scale)

plt.show()


going for Gaussian
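
To put a number on how Gaussian the density values actually look, one option (a sketch we have not run, using scipy.stats imported at the top) is D'Agostino's normality test:

In [ ]:
# Sketch: test the null hypothesis that the density values are normally distributed.
stat, p_value = stats.normaltest(data_density[:, 3])
print 'normality test: statistic = %.2f, p = %.4g' % (stat, p_value)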

Regressions

Basic regressions


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression','SVR','KNN Regression','Random Forest Regression','Polynomial Regression']
regressions = [LinearRegression(),
               LinearSVR(C=1.0),
               KNN(n_neighbors=10, algorithm='auto'),
               RF(max_depth=5, max_features=1),
               Pipeline([('poly', PF(degree=2)),('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10

print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_density[:, (0, 1, 2)] # x,y,z
Y = data_density[:, 3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))


Regression on X=(x,y,z), Y=syn/unmasked
R^2 of Linear Regression: 0.06 (+/- 0.15)
R^2 of SVR: -2.72 (+/- 9.81)
R^2 of KNN Regression: 0.08 (+/- 0.08)
R^2 of Random Forest Regression: 0.13 (+/- 0.12)
R^2 of Polynomial Regression: 0.11 (+/- 0.14)

Regressions of x y z with density


In [ ]:
# x 
print
print('Regressions on x and density')
X = data_density[:,[0]] # x only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))

# y 
print
print('Regression on y and density')
X = data_density[:,[1]] # y only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))

# z -> syn/unmasked
print
print('Regression on z and density')
X = data_density[:,[2]] # z only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))

Mean and covariance of the 3- and 4-cluster fits

Heavily adapted from Jay's and Emily's code, applied to 3 clusters, since 3 clusters appeared very evenly in Emily's clustering visualization.


In [6]:
from mpl_toolkits.mplot3d import Axes3D

n_clusters = 3
###########################################
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
centroidmatrix = [0]*4  # placeholder row, removed after the centroids are stacked
print centroidmatrix

predicted = gmm.fit_predict(data_density)
for label, row in zip(predicted, data_density[:,]):
    clusters[label].append(row)

    
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i])) 
    print "centroid: ", np.average(clusters[i], axis=0)
    centroidmatrix = np.vstack((centroidmatrix,np.average(clusters[i], axis=0)))
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    # rough look at the covariance matrix: rows plotted as points in (x, y, z),
    # sized by the magnitude of the density column
    ax.scatter(
       covar[:, 0], covar[:, 1], covar[:, 2],  # data
       c='blue',  # marker colour
       marker='o',  # marker shape
       s=np.absolute(covar[:,3])/10  # marker size
    )
    plt.show()

centroidmatrix = np.delete(centroidmatrix,0,0)
print centroidmatrix

fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.scatter(
       centroidmatrix[:, 0], centroidmatrix[:, 1], centroidmatrix[:, 2],  # data
       c='blue',  # marker colour
       marker='o',  # marker shape
       s=centroidmatrix[:,3]*10  # marker size
)




plt.show()


[0, 0, 0, 0]
# of samples in cluster 1: 11903
centroid:  [ 1896.44879442  2549.83533563   606.29782408   219.80217679]
cluster covariance: 
[[  6.37601188e+05  -1.94515093e+03   2.48910805e+03  -1.95651532e+03]
 [ -1.94515093e+03   1.22589408e+05  -1.06825346e+02   7.90697880e+03]
 [  2.48910805e+03  -1.06825346e+02   1.40257493e+05   1.10068332e+03]
 [ -1.95651532e+03   7.90697880e+03   1.10068332e+03   6.28020427e+03]]
determinant of covariance matrix:  6.30963438168e+19

# of samples in cluster 2: 10301
centroid:  [ 2896.40568877  2133.61353267   607.3707407    364.90293327]
cluster covariance: 
[[ 151160.48967249   16963.34806281    2136.43734705    3226.41246506]
 [  16963.34806281  139431.79053141    -932.31699818   -1593.82988435]
 [   2136.43734705    -932.31699818  114693.22282925    -828.75440396]
 [   3226.41246506   -1593.82988435    -828.75440396    5276.45980125]]
determinant of covariance matrix:  1.23295044871e+19

# of samples in cluster 3: 10763
centroid:  [ 1161.63876243  2078.80507294   616.61070334   360.60027403]
cluster covariance: 
[[ 216288.60876308  -24939.31461405    2577.08905543    -263.23827   ]
 [ -24939.31461405  121208.36842994    3077.08820439   -1360.33952398]
 [   2577.08905543    3077.08820439  112476.7222493    -1721.7382217 ]
 [   -263.23827      -1360.33952398   -1721.7382217     3819.52361213]]
determinant of covariance matrix:  1.08629113105e+19

[[ 1896.44879442  2549.83533563   606.29782408   219.80217679]
 [ 2896.40568877  2133.61353267   607.3707407    364.90293327]
 [ 1161.63876243  2078.80507294   616.61070334   360.60027403]]
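
One of the goals above is to plot the cluster centers and draw the ellipsoids formed by the diagonals of the covariance matrix around each centroid. A minimal sketch of that, reusing clusters, centroidmatrix, and n_clusters from the cell above and taking the semi-axis lengths as one standard deviation along each spatial axis:

In [ ]:
# Sketch: wireframe ellipsoids from the diagonal of each cluster's covariance
# matrix, centered on the cluster centroids (spatial axes only).
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
u = np.linspace(0, 2 * np.pi, 30)
v = np.linspace(0, np.pi, 30)
for c_idx in xrange(n_clusters):
    cx, cy, cz = centroidmatrix[c_idx, :3]
    # semi-axis lengths: one standard deviation along x, y, z
    sx, sy, sz = np.sqrt(np.diag(np.cov(clusters[c_idx].T))[:3])
    ax.plot_wireframe(cx + sx * np.outer(np.cos(u), np.sin(v)),
                      cy + sy * np.outer(np.sin(u), np.sin(v)),
                      cz + sz * np.outer(np.ones_like(u), np.cos(v)),
                      rstride=4, cstride=4, alpha=0.3)
ax.scatter(centroidmatrix[:, 0], centroidmatrix[:, 1], centroidmatrix[:, 2],
           c='blue', marker='o', s=50)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.show()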

Jay's code


In [9]:
n_clusters = 4
####################################
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
predicted = gmm.fit_predict(data_density)
for label, row in zip(predicted, data_density[:,]):
    clusters[label].append(row)

for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i])) 
    print "centroid: ", np.average(clusters[i], axis=0)
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print


# of samples in cluster 1: 10413
centroid:  [ 2847.98501873  2298.50561798   612.96341112   315.19082516]
cluster covariance: 
[[ 168166.0595143     -365.78410715   -1288.29972223    6747.01504593]
 [   -365.78410715   98859.33931246   -1944.85135993    3117.07123641]
 [  -1288.29972223   -1944.85135993  121449.86506527    -613.80914916]
 [   6747.01504593    3117.07123641    -613.80914916   10118.62306209]]
determinant of covariance matrix:  1.96712741238e+19

# of samples in cluster 2: 6885
centroid:  [ 1885.86230937  1734.59172113   594.4583878    382.9287353 ]
cluster covariance: 
[[  7.66013849e+05  -7.17937828e+03   7.68077216e+03   2.40261595e+03]
 [ -7.17937828e+03   1.27685781e+04  -1.30569443e+03  -9.99461027e+01]
 [  7.68077216e+03  -1.30569443e+03   1.18941368e+05  -1.89973471e+03]
 [  2.40261595e+03  -9.99461027e+01  -1.89973471e+03   3.73872020e+03]]
determinant of covariance matrix:  4.27383286219e+18

# of samples in cluster 3: 11317
centroid:  [ 1214.2705664   2324.48449236   619.12167536   295.86042327]
cluster covariance: 
[[ 233306.2230051    -2110.47273909   -2683.85980646   -2622.01262681]
 [  -2110.47273909   84653.30722238   -1514.01388706    3398.71694044]
 [  -2683.85980646   -1514.01388706  123618.93084597     541.29093617]
 [  -2622.01262681    3398.71694044     541.29093617    8286.49220836]]
determinant of covariance matrix:  1.98104856259e+19

# of samples in cluster 4: 7421
centroid:  [ 1938.14580245  2952.51771998   631.56879127   215.21318891]
cluster covariance: 
[[ 777537.7124315    -3114.29436294   14489.20627599   -3414.56719744]
 [  -3114.29436294   12016.82087868    3873.60750727    -844.72482741]
 [  14489.20627599    3873.60750727  126255.04718761    9579.26807117]
 [  -3414.56719744    -844.72482741    9579.26807117   10950.22358942]]
determinant of covariance matrix:  1.17368592784e+19

Graph clusters: Emily's code


In [7]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

def graph_cluster(xyz_only, clusters, centers, k):
    # randomly subsample points so the 3D scatter stays manageable
    samples = 10000
    perm = np.random.permutation(len(xyz_only))
    xyz_only = xyz_only[perm[:samples]]
    clusters = clusters[perm[:samples]]

    # get range for graphing
    mins = [np.amin(xyz_only[:, i]) for i in xrange(3)]
    maxs = [np.amax(xyz_only[:, i]) for i in xrange(3)]

    # following code adopted from
    # https://www.getdatajoy.com/examples/python-plots/3d-scatter-plot
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')

    ax.set_title('K-means, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')

    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])

    ax.view_init()
    ax.dist = 10  # distance

    ax.scatter(
               xyz_only[:, 0], xyz_only[:, 1], xyz_only[:, 2],  # data
               c=clusters,  # marker colour
               marker='o',  # marker shape
               s=50  # marker size
    )

    plt.show()
    
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')
    ax.set_title('Centers, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')

    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])

    ax.view_init()
    ax.dist = 10  # distance

    ax.scatter(
           centers[:, 0], centers[:, 1], centers[:, 2],  # data
           c='blue',  # marker colour
           marker='o',  # marker shape
           s=100  # marker size
    )
    plt.show()

In [8]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

n_clusters = 4
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(data_density[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers
graph_cluster(data_density[:,[0,1,2]], clusters, centers, n_clusters)


[[ 1929.65968586  2650.96335079   610.        ]
 [ 3057.46220302  2277.83477322   610.        ]
 [  878.60737527  2255.17136659   610.        ]
 [ 2003.66666667  1881.34895833   610.        ]]

In [9]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

n_clusters = 3
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(data_density[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers

graph_cluster(data_density[:,[0,1,2]], clusters, centers, n_clusters)


[[ 1969.  2266.   610.]
 [ 3022.  2266.   610.]
 [  916.  2266.   610.]]
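
Toward the goal of running regressions within layers: if the k=3 clusters above are taken as a rough stand-in for layers, a sketch of per-cluster regression (reusing clusters, data_density, regressions, names, k_fold, and cross_validation from the cells above) could look like:

In [ ]:
# Sketch: repeat the (x, y, z) -> density regressions separately inside each
# k-means cluster, treating the clusters as provisional "layers".
for label in xrange(n_clusters):
    in_cluster = (clusters == label)
    X = data_density[in_cluster][:, (0, 1, 2)]
    Y = data_density[in_cluster][:, 3]
    print 'cluster %d (%d samples):' % (label + 1, in_cluster.sum())
    for idx2, reg in enumerate(regressions):
        scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
        print("  R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))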

K = 5


In [11]:
n_clusters = 5
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(data_density[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers

graph_cluster(data_density[:,[0,1,2]], clusters, centers, n_clusters)


[[ 1964.22963206  2278.60709593   609.89060447]
 [  973.95933457  1886.88539741   610.        ]
 [ 2961.7         1886.95652174   606.95585284]
 [  947.96840149  2636.5          610.        ]
 [ 2983.22803207  2636.03742065   613.2079853 ]]

In [ ]:
print 'done'
