In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
import scipy.stats as stats
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:]  # Remove label row
# Clip data based on x- and y-coordinate thresholds, found from Bijan's visualization
x_bounds = (409, 3529)
y_bounds = (1564, 3124)
def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:  # check x in bounds
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:  # check y in bounds
        return False
    if row[3] == 0:  # remove rows with zero unmasked values
        return False
    return True
indices_in_bound = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_clipped = csv[indices_in_bound]
density = np.divide(data_clipped[:, 4],data_clipped[:, 3])*(64**3)
data_density = np.vstack((data_clipped[:,0],data_clipped[:,1],data_clipped[:,2],density))
data_density = data_density.T
print 'Finished removing zero-unmasked rows and image edges'
Cluster borders below are visual estimates taken from the cluster plots in the previous homework (Emily). A sketch of deriving comparable boundaries programmatically follows.
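Rather than reading the boundaries off a plot, one could estimate them by clustering the x-coordinates directly. This is a minimal illustration, not the method actually used for the bounds below; it assumes data_density from the cell above.
In [ ]:
# Sketch: k-means (k=3) on the x-coordinate alone; the per-cluster x-ranges
# approximate the visually estimated boundaries used below.
from sklearn.cluster import KMeans
km_x = KMeans(n_clusters=3, random_state=1)
labels = km_x.fit_predict(data_density[:, [0]])
for label in np.unique(labels):
    xs = data_density[labels == label, 0]
    print 'cluster %d x-range: (%d, %d)' % (label, xs.min(), xs.max())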
In [14]:
# edges of k-means, k = 3 clusters
cluster3x1bounds = (409, 1500)
cluster3x2bounds = (1501, 2500)
cluster3x3bounds = (2501, 3529)
# edges of k-means, k = 4 clusters
cluster4x1bounds = (409, 1100)
cluster4x4bounds = (2750, 3529)
def check_in_cluster(row, x_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:  # check x in bounds
        return False
    return True
indices_in_bound = np.where(np.apply_along_axis(check_in_cluster, 1, data_density, cluster3x1bounds))
data_k3_1 = data_density[indices_in_bound]
indices_in_bound = np.where(np.apply_along_axis(check_in_cluster, 1, data_density, cluster3x2bounds))
data_k3_2 = data_density[indices_in_bound]
indices_in_bound = np.where(np.apply_along_axis(check_in_cluster, 1, data_density, cluster3x3bounds))
data_k3_3 = data_density[indices_in_bound]
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression', 'SVR', 'KNN Regression',
         'Random Forest Regression', 'Polynomial Regression']
regressions = [LinearRegression(),
               LinearSVR(C=1.0),
               KNN(n_neighbors=10, algorithm='auto'),
               RF(max_depth=5, max_features=1),
               Pipeline([('poly', PF(degree=2)),
                         ('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10
In [20]:
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_k3_1[:, (0, 1, 2)]  # x,y,z
Y = data_k3_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# x -> syn/unmasked
print
print('Regression on x and density')
X = data_k3_1[:, [0]]  # x only
Y = data_k3_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print
print('Regression on y and density')
X = data_k3_1[:, [1]]  # y only
Y = data_k3_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print
print('Regression on z and density')
X = data_k3_1[:, [2]]  # z only
Y = data_k3_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [19]:
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_k3_2[:, (0, 1, 2)]  # x,y,z
Y = data_k3_2[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# x -> syn/unmasked
print
print('Regression on x and density')
X = data_k3_2[:, [0]]  # x only
Y = data_k3_2[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print
print('Regression on y and density')
X = data_k3_2[:, [1]]  # y only
Y = data_k3_2[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print
print('Regression on z and density')
X = data_k3_2[:, [2]]  # z only
Y = data_k3_2[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [21]:
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_k3_3[:, (0, 1, 2)]  # x,y,z
Y = data_k3_3[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# x -> syn/unmasked
print
print('Regression on x and density')
X = data_k3_3[:, [0]]  # x only
Y = data_k3_3[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print
print('Regression on y and density')
X = data_k3_3[:, [1]]  # y only
Y = data_k3_3[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print
print('Regression on z and density')
X = data_k3_3[:, [2]]  # z only
Y = data_k3_3[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [22]:
# edges of k-means, k = 4 clusters; not sure how to get the full clusters
cluster4x1bounds = (409, 1100)
cluster4x4bounds = (2750, 3529)
def check_in_cluster(row, x_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:  # check x in bounds
        return False
    return True
indices_in_bound = np.where(np.apply_along_axis(check_in_cluster, 1, data_density, cluster4x1bounds))
data_k4_1 = data_density[indices_in_bound]
indices_in_bound = np.where(np.apply_along_axis(check_in_cluster, 1, data_density, cluster4x4bounds))
data_k4_4 = data_density[indices_in_bound]
In [25]:
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_k4_1[:, (0, 1, 2)]  # x,y,z
Y = data_k4_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# x -> syn/unmasked
print
print('Regression on x and density')
X = data_k4_1[:, [0]]  # x only
Y = data_k4_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print
print('Regression on y and density')
X = data_k4_1[:, [1]]  # y only
Y = data_k4_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print
print('Regression on z and density')
X = data_k4_1[:, [2]]  # z only
Y = data_k4_1[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [26]:
print('Regression on X=(x,y,z), Y=syn/unmasked')
X = data_k4_4[:, (0, 1, 2)]  # x,y,z
Y = data_k4_4[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# x -> syn/unmasked
print
print('Regression on x and density')
X = data_k4_4[:, [0]]  # x only
Y = data_k4_4[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y -> syn/unmasked
print
print('Regression on y and density')
X = data_k4_4[:, [1]]  # y only
Y = data_k4_4[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z -> syn/unmasked
print
print('Regression on z and density')
X = data_k4_4[:, [2]]  # z only
Y = data_k4_4[:, 3]  # syn/unmasked
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: \t %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
Focusing on the centroids, we can see that the GMM centroid centers are not near the centers calculated with k-means (the sketch after the next cell compares them directly).
In [36]:
import sklearn.mixture as mixture
from mpl_toolkits.mplot3d import Axes3D  # needed for the 3D projection below
n_clusters = 4
###########################################
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag', random_state=1)
clusters = [[] for i in xrange(n_clusters)]
centroidmatrix = [0] * 4
print centroidmatrix
predicted = gmm.fit_predict(data_density)
for label, row in zip(predicted, data_density):
    clusters[label].append(row)
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i + 1, len(clusters[i]))
    print "centroid: ", np.average(clusters[i], axis=0)
    centroidmatrix = np.vstack((centroidmatrix, np.average(clusters[i], axis=0)))
    # print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    # print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
centroidmatrix = np.delete(centroidmatrix, (0), axis=0)
print centroidmatrix
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.view_init()
ax.dist = 10 # distance
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.set_title('Scatter Plot of Centroids, size weighted by density')
ax.set_xticks(np.arange(500, 3500, 500))
ax.set_yticks(np.arange(1200,3200, 500))
ax.set_zticks(np.arange(0,1200, 150))
ax.scatter(
    centroidmatrix[:, 0], centroidmatrix[:, 1], centroidmatrix[:, 2],  # data
    c='blue',    # marker colour
    marker='o',  # marker shape
    s=centroidmatrix[:, 3] * 10  # marker size
)
plt.show()
Out[36]: [figure: 3D scatter plot of centroids, size weighted by density]
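To check the claim above directly, here is a minimal sketch (not part of the original analysis) that fits a GMM and k-means on the same data and prints both sets of centers side by side; the GMM parameters mirror the cell above.
In [ ]:
# Sketch, assuming data_density from above: compare GMM means against
# k-means centers fitted on the same data.
from sklearn.cluster import KMeans
gmm_check = mixture.GMM(n_components=4, n_iter=1000, covariance_type='diag', random_state=1)
gmm_check.fit(data_density)
km_check = KMeans(n_clusters=4, random_state=1).fit(data_density)
print 'GMM means:'
print gmm_check.means_
print 'k-means centers:'
print km_check.cluster_centers_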
Hexbin for the bottom 4 z-levels... doesn't seem to be anything interesting, only to realize the bins may be too small and are effectively just points (and the counts carry no density data). A density-weighted sketch follows, before the original plots.
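As a possible fix (an alternative sketch, not the original approach), hexbin can weight each hexagon by the density column via its C argument, with a coarser gridsize so bins aggregate more than one point. Assumes data_density from the cells above; z == 55 selects the bottom slice.
In [ ]:
# Sketch: density-weighted hexbin with a coarser grid, so each hexagon
# averages the density of the points it contains rather than counting them.
slice55 = data_density[data_density[:, 2] == 55]
plt.hexbin(slice55[:, 0], slice55[:, 1], C=slice55[:, 3],
           gridsize=20, reduce_C_function=np.mean, cmap=plt.cm.YlOrRd_r)
plt.title('Hexbin of mean density, z = 55')
plt.xlabel('x coordinates')
plt.ylabel('y coordinates')
cb = plt.colorbar()
cb.set_label('mean density')
plt.show()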
In [44]:
from mpl_toolkits.mplot3d import Axes3D
# Random sample
samples = 20000
perm = np.random.permutation(len(data_density))
data_density_sample = data_density[perm[:samples]]
data_uniques, UIndex, UCounts = np.unique(data_density_sample[:, 2], return_index=True, return_counts=True)
print 'uniques'
print 'index: ' + str(UIndex)
print 'counts: ' + str(UCounts)
print 'values: ' + str(data_uniques)
xmin = data_density_sample[:, 0].min()
xmax = data_density_sample[:, 0].max()
ymin = data_density_sample[:, 1].min()
ymax = data_density_sample[:, 1].max()
def check_z(row):
    if row[2] == 55:
        return True
    return False
index_true = np.where(np.apply_along_axis(check_z, 1, data_density_sample))
dds55 = data_density_sample[index_true]
data_uniques, UIndex, UCounts = np.unique(dds55[:,2], return_index = True, return_counts = True)
print 'uniques check'
print 'index: ' + str(UIndex)
print 'counts: ' + str(UCounts)
print 'values: ' + str(data_uniques)
#plt.subplots_adjust(hspace=1)
#plt.subplot(121)
plt.hexbin(dds55[:,0], dds55[:,1], cmap=plt.cm.YlOrRd_r,mincnt=1)
plt.axis([xmin, xmax, ymin, ymax])
plt.title("Hexagon binning")
plt.xlabel('x coordinates')
plt.ylabel('y coordinates')
cb = plt.colorbar()
cb.set_label('count')
plt.show()
def check_z(row):
    if row[2] == 166:
        return True
    return False
index_true = np.where(np.apply_along_axis(check_z, 1, data_density_sample))
ddsZ = data_density_sample[index_true]
plt.hexbin(ddsZ[:,0], ddsZ[:,1], cmap=plt.cm.YlOrRd_r,mincnt=1)
plt.axis([xmin, xmax, ymin, ymax])
plt.title("Hexagon binning")
plt.xlabel('x coordinates')
plt.ylabel('y coordinates')
cb = plt.colorbar()
cb.set_label('count')
plt.show()
def check_z(row):
    if row[2] == 277:
        return True
    return False
index_true = np.where(np.apply_along_axis(check_z, 1, data_density_sample))
ddsZ = data_density_sample[index_true]
plt.hexbin(ddsZ[:,0], ddsZ[:,1], cmap=plt.cm.YlOrRd_r,mincnt=1)
plt.axis([xmin, xmax, ymin, ymax])
plt.title("Hexagon binning")
plt.xlabel('x coordinates')
plt.ylabel('y coordinates')
cb = plt.colorbar()
cb.set_label('count')
plt.show()
def check_z(row):
    if row[2] == 388:
        return True
    return False
index_true = np.where(np.apply_along_axis(check_z, 1, data_density_sample))
ddsZ = data_density_sample[index_true]
plt.hexbin(ddsZ[:,0], ddsZ[:,1], cmap=plt.cm.YlGnBu,mincnt=1)
plt.axis([xmin, xmax, ymin, ymax])
plt.title("Hexagon binning")
plt.xlabel('x coordinates')
plt.ylabel('y coordinates')
cb = plt.colorbar()
cb.set_label('count')
plt.show()
... something isn't right, though. The histogram sketch below locates the spike in density values before the threshold check.
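A minimal histogram sketch (not in the original), assuming data_density from above, to see where the spike actually sits:
In [ ]:
# Sketch: histogram of the density column to locate the spike before
# thresholding it in the next cell.
plt.hist(data_density[:, 3], bins=100)
plt.xlabel('density (syn/unmasked)')
plt.ylabel('count')
plt.title('Distribution of density values')
plt.show()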
In [47]:
def check_spike(row):
    if row[3] > 0.0013 and row[3] < 0.00135:
        return True
    return False
index_true = np.where(np.apply_along_axis(check_spike, 1, data_density))
spike = data_density[index_true]
print "Spike"
print "length: ", len(spike)
print "Mean: ", np.mean(spike[:, 3])
print "Median: ", np.median(spike[:, 3])
print "STD: ", np.std(spike[:, 3])