New Analysis

Goals:

  • Histogram of density (syn/unmasked)
  • Distribution along all three axes (x, y, z)
  • Break into quarters and run a two-sample test on two of them (Bijan is doing halves?); see the sketch after this list
  • Visualization of density in the x/y plane for each of the 12 slices
  • Clustering along each of the planes
  • Run all our tests on the "cleaned" data and see how the results differ
  • If we find layers, or define them, run regressions/tests within them
  • Spatial point process
  • Plot the cluster centers and draw around each centroid the ellipsoid formed by the diagonal of the covariance matrix
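
For the "break into quarters" bullet above, here is a minimal sketch of what such a test could look like. It assumes the quarters are taken along the y-axis (equal row counts after sorting by y) and compares two of them with a two-sample KS test; the axis, the choice of test, and the `quarter_and_test` helper name are all assumptions, and it expects the `data_density` array built in the cells below.


In [ ]:
import numpy as np
import scipy.stats as stats

def quarter_and_test(data_density, axis=1, q_a=0, q_b=3):
    # Sort rows by the chosen coordinate, then split into 4 equal-count quarters
    order = np.argsort(data_density[:, axis])
    quarters = np.array_split(data_density[order, 3], 4)  # density column of each quarter
    stat, p = stats.ks_2samp(quarters[q_a], quarters[q_b])
    print('KS statistic: %0.4f, p-value: %0.4g' % (stat, p))
    return stat, p

# Example (once data_density exists): compare the first and last quarters along y
# quarter_and_test(data_density, axis=1, q_a=0, q_b=3)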

Setting up and removing rows where the unmasked value is 0

Bounds check shared by all; indices taken from the old code


In [1]:
import matplotlib.pyplot as plt
%matplotlib inline 
import numpy as np
import urllib2
import scipy.stats as stats

url = ('https://raw.githubusercontent.com/Upward-Spiral-Science/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # Remove label row

# Clip data based on thresholds on the x and y coordinates, found from Bijan's visualization
x_bounds = (409, 3529)
y_bounds = (1564, 3124)

def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]: #check x inbounds
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]: #check y inbounds
        return False
    if row[3] == 0:                                  # remove zeros of unmasked values
        return False
    return True

indices_in_bound = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_clipped = csv[indices_in_bound]
n = data_clipped.shape[0]
print 'x bounds:', x_bounds, 'y bounds:', y_bounds, 'rows after clipping:', n


x bounds: (409, 3529) y bounds: (1564, 3124) rows after clipping: 36036

Histogram of syn/unmasked (density)

Combining data; snippet from Bijan


In [2]:
density = np.divide(data_clipped[:, 4], data_clipped[:, 3])*(64**3)  # density = synapses / unmasked, scaled by 64**3 (snippet from Bijan)

data_density = np.vstack((data_clipped[:,0],data_clipped[:,1],data_clipped[:,2],density))
data_density = data_density.T

bins = 100

fig = plt.figure(1)
ax = fig.gca()
plt.hist(data_density[:,3],bins)
ax.set_title('Histogram of data_density')
ax.set_xlabel('density')

plt.show()


Checking for the optimal number of clusters


In [3]:
import sklearn.mixture as mixture
max_clusters = 15
bic = np.array([])
i = np.array(range(1, max_clusters))
for idx in range(1, max_clusters):
    print 'testing for cluster: ' + str(idx) 
    gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag', random_state=1)
    gmm.fit(data_density)
    bic = np.append(bic, gmm.bic(data_density))
    
fig = plt.figure(2)
ay = fig.gca()
plt.plot(i, 1.0/bic)  # plot reciprocal BIC so that higher is better (lower BIC is better)
ay.set_title('BIC')
ay.set_ylabel('1 / BIC')
ay.set_xlabel('# of clusters')


plt.show()
print 'done'


testing for cluster: 1
testing for cluster: 2
testing for cluster: 3
testing for cluster: 4
testing for cluster: 5
testing for cluster: 6
testing for cluster: 7
testing for cluster: 8
testing for cluster: 9
testing for cluster: 10
testing for cluster: 11
testing for cluster: 12
testing for cluster: 13
testing for cluster: 14
done
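
Lower BIC is better (the plot above shows 1/BIC, so higher is better there). A small convenience check, using the `bic` array already computed above, reads off the suggested number of components; nothing new is fit here.


In [ ]:
# bic[idx] corresponds to idx + 1 components, since the loop above starts at 1
best_k = np.argmin(bic) + 1
print('lowest BIC at %d components (BIC = %0.1f)' % (best_k, bic.min()))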

Plotting a Gaussian fit over the density histogram


In [4]:
print 'going for Gaussian'

import matplotlib.mlab as mlab

fig = plt.figure(3)
az = fig.gca()
result = plt.hist(data_density[:,3],bins)
az.set_title('Histogram of data_density')
az.set_xlabel('density')

plt.xlim((min(data_density[:,3]), max(data_density[:,3])))

mean = np.mean(data_density[:,3])
variance = np.var(data_density[:,3])
sigma = np.sqrt(variance)
x = np.linspace(min(data_density[:,3]), max(data_density[:,3]),100)
dx = result[1][1] - result[1][0]   # bin width from the histogram edges
scale = len(data_density[:,3])*dx  # scale the pdf to histogram counts (n * bin width)
plt.plot(x, mlab.normpdf(x,mean,sigma)*scale)

plt.show()


going for Gaussian
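
The Gaussian overlay above is only an eyeball check. As a minimal follow-up sketch, scipy's D'Agostino-Pearson normality test can be run on the density column (scipy.stats is already imported as `stats`); with roughly 36,000 rows even small departures from normality will produce a tiny p-value, so treat the result as descriptive rather than decisive.


In [ ]:
# Numerical check of the Gaussian fit above (D'Agostino-Pearson normality test)
stat, p = stats.normaltest(data_density[:, 3])
print('normaltest statistic: %0.2f, p-value: %0.3g' % (stat, p))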

Regressions

Basic regressions


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression','SVR','KNN Regression','Random Forest Regression','Polynomial Regression']
regressions = [LinearRegression(),
               LinearSVR(C=1.0),
               KNN(n_neighbors=10, algorithm='auto'),
               RF(max_depth=5, max_features=1),
               Pipeline([('poly', PF(degree=2)),('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10

print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_density[:, (0, 1, 2)] # x,y,z
Y = data_density[:, 3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))


Regression on X=(x,y,z), Y=syn/unmasked
R^2 of Linear Regression: 0.12 (+/- 0.16)
R^2 of SVR: -0.86 (+/- 0.68)
R^2 of KNN Regression: 0.20 (+/- 0.08)
R^2 of Random Forest Regression: 0.21 (+/- 0.12)
R^2 of Polynomial Regression: 0.18 (+/- 0.17)

Regressions of x, y, and z individually against density


In [10]:
# x 
print
print('Regressions on x and density')
X = data_density[:,[0]] # x only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))

# y 
print
print('Regression on y and density')
X = data_density[:,[1]] # y only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))

# z -> syn/unmasked
print
print('Regression on z and density')
X = data_density[:,[2]] # z only
Y = data_density[:,3] # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))


Regressions on x and density
R^2 of Linear Regression: -0.01 (+/- 0.04)
R^2 of SVR: -1.73 (+/- 6.47)
R^2 of KNN Regression: -0.17 (+/- 0.36)
R^2 of Random Forest Regression: -0.10 (+/- 0.23)
R^2 of Polynomial Regression: -0.01 (+/- 0.02)

Regression on y and density
R^2 of Linear Regression: 0.12 (+/- 0.15)
R^2 of SVR: -1.19 (+/- 2.59)
R^2 of KNN Regression: 0.00 (+/- 0.19)
R^2 of Random Forest Regression: 0.14 (+/- 0.16)
R^2 of Polynomial Regression: 0.14 (+/- 0.16)

Regression on z and density
R^2 of Linear Regression: -0.01 (+/- 0.04)
R^2 of SVR: -0.66 (+/- 2.78)
R^2 of KNN Regression: -0.08 (+/- 0.08)
R^2 of Random Forest Regression: 0.04 (+/- 0.06)
R^2 of Polynomial Regression: 0.01 (+/- 0.05)

Mean and covariance of 3 and 4 clusters... meaningless graphs :(

Heavily adapted Jay's and Emily's code, applied to 3 clusters, since 3 clusters split the data very evenly in Emily's clustering visualization.


In [17]:
from mpl_toolkits.mplot3d import Axes3D  # needed for the 3D plots below

n_clusters = 3
###########################################
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
centroidmatrix = [0]*4  # placeholder row of zeros; cluster centroids are stacked onto it below
print centroidmatrix

predicted = gmm.fit_predict(data_density)
for label, row in zip(predicted, data_density[:,]):
    clusters[label].append(row)

    
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i])) 
    print "centroid: ", np.average(clusters[i], axis=0)
    centroidmatrix = np.vstack((centroidmatrix,np.average(clusters[i], axis=0)))
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    # Scatter the rows of the cluster covariance matrix (x, y, z columns), sized by the density column
    ax.scatter(
       covar[:, 0], covar[:, 1], covar[:, 2],  # data
       c='blue',  # marker colour
       marker='o',  # marker shape
       s=np.absolute(covar[:,3])/10  # marker size
    )
    plt.show()

print centroidmatrix

fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.scatter(
       centroidmatrix[:, 0], centroidmatrix[:, 1], centroidmatrix[:, 2],  # data
       c='blue',  # marker colour
       marker='o',  # marker shape
       s=centroidmatrix[:,3]*10  # marker size
)




plt.show()


[0, 0, 0, 0]
# of samples in cluster 1: 11711
centroid:  [ 1917.13209803  2728.47186406   615.84809154   198.53145688]
cluster covariance: 
[[ 694228.44377834    3332.46934469    7326.98104205   -1012.57378398]
 [   3332.46934469  107527.12745339    4160.96306116    5206.7114939 ]
 [   7326.98104205    4160.96306116  136933.01936421    5244.09632285]
 [  -1012.57378398    5206.7114939     5244.09632285    8083.49441601]]
determinant of covariance matrix:  7.79740049612e+19

# of samples in cluster 2: 12042
centroid:  [ 2833.85301445  2154.5284006    612.52566019   353.56146845]
cluster covariance: 
[[ 177410.44297281   16206.91107624    1131.72132037    5275.82609097]
 [  16206.91107624  149135.43649091    2950.29185485    -312.71782313]
 [   1131.72132037    2950.29185485  116401.02869953    -719.86939832]
 [   5275.82609097    -312.71782313    -719.86939832    6058.23769379]]
determinant of covariance matrix:  1.79573855756e+19

# of samples in cluster 3: 12283
centroid:  [ 1155.11829358  2132.69258325   617.18431979   348.55546653]
cluster covariance: 
[[ 206754.75404001  -22816.60497667    1580.69672556    -618.26261164]
 [ -22816.60497667  140721.65666783    4509.96376743   -2056.73010699]
 [   1580.69672556    4509.96376743  115533.24529393   -1333.14608956]
 [   -618.26261164   -2056.73010699   -1333.14608956    4699.74387121]]
determinant of covariance matrix:  1.53333176368e+19

[[    0.             0.             0.             0.        ]
 [ 1917.13209803  2728.47186406   615.84809154   198.53145688]
 [ 2833.85301445  2154.5284006    612.52566019   353.56146845]
 [ 1155.11829358  2132.69258325   617.18431979   348.55546653]]
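
The goals list asks for ellipsoids formed by the diagonal of the covariance matrix drawn around each centroid. Below is a minimal sketch of how that could look, using the `clusters` list from the cell above: each ellipsoid is axis-aligned with semi-axes equal to one standard deviation along x, y, and z (the one-sigma scaling is an assumption).


In [ ]:
from mpl_toolkits.mplot3d import Axes3D  # registers the 3d projection

fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.set_title('Cluster centroids with 1-sigma covariance ellipsoids')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

u = np.linspace(0, 2 * np.pi, 30)  # azimuthal angle
v = np.linspace(0, np.pi, 30)      # polar angle
for c in clusters:
    center = np.average(c, axis=0)[:3]         # spatial centroid (x, y, z)
    radii = np.sqrt(np.diag(np.cov(c.T))[:3])  # 1-sigma semi-axes from the covariance diagonal
    ex = center[0] + radii[0] * np.outer(np.cos(u), np.sin(v))
    ey = center[1] + radii[1] * np.outer(np.sin(u), np.sin(v))
    ez = center[2] + radii[2] * np.outer(np.ones_like(u), np.cos(v))
    ax.plot_wireframe(ex, ey, ez, rstride=4, cstride=4, alpha=0.3)
    ax.scatter([center[0]], [center[1]], [center[2]], c='blue', marker='o', s=50)
plt.show()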

Jay's code


In [ ]:
n_clusters = 4
####################################
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
predicted = gmm.fit_predict(data_density)
for label, row in zip(predicted, data_density[:,]):
    clusters[label].append(row)

for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i])) 
    print "centroid: ", np.average(clusters[i], axis=0)
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print

Graph clusters: Emily's code


In [ ]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

def graph_cluster(xyz_only, clusters, centers, k):
    # randomly sample rows so the scatter plot stays manageable
    samples = 10000
    perm = np.random.permutation(len(xyz_only))
    xyz_only = xyz_only[perm[:samples]]
    clusters = clusters[perm[:samples]]

    # get range for graphing
    mins = [np.amin(xyz_only[:, i]) for i in xrange(3)]
    maxs = [np.amax(xyz_only[:, i]) for i in xrange(3)]

    # following code adopted from
    # https://www.getdatajoy.com/examples/python-plots/3d-scatter-plot
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')

    ax.set_title('K-means, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')

    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])

    ax.view_init()
    ax.dist = 10  # distance

    ax.scatter(
               xyz_only[:, 0], xyz_only[:, 1], xyz_only[:, 2],  # data
               c=clusters,  # marker colour
               marker='o',  # marker shape
               s=50  # marker size
    )

    plt.show()
    
    fig = plt.figure(figsize=(10, 7))
    ax = fig.gca(projection='3d')
    ax.set_title('Centers, k='+str(k))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')

    ax.set_xlim(mins[0], maxs[0])
    ax.set_ylim(mins[1], maxs[1])
    ax.set_zlim(mins[2], maxs[2])

    ax.view_init()
    ax.dist = 10  # distance

    ax.scatter(
           centers[:, 0], centers[:, 1], centers[:, 2],  # data
           c='blue',  # marker colour
           marker='o',  # marker shape
           s=100  # marker size
    )
    plt.show()

In [ ]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

n_clusters = 4
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(data_density[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers
graph_cluster(data_density[:,[0,1,2]], clusters, centers, n_clusters)

In [ ]:
import sklearn.cluster as cluster
from mpl_toolkits.mplot3d import Axes3D

n_clusters = 3
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(data_density[:,[0,1,2]])
centers = kmeans.cluster_centers_
print centers

graph_cluster(data_density[:,[0,1,2]], clusters, centers, n_clusters)

In [ ]:
print 'done'
