In [1]:
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
import scipy.stats as stats
np.set_printoptions(precision=3, suppress=True)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
# chopping data based on thresholds on x and y coordinates
x_bounds = (409, 3529)
y_bounds = (1564, 3124)
def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:
        return False
    if row[3] == 0:  # drop bins with zero unmasked voxels
        return False
    return True
indices_in_bound, = np.where(np.apply_along_axis(check_in_bounds, 1, csv, x_bounds, y_bounds))
data_thresholded = csv[indices_in_bound]
n = data_thresholded.shape[0]
def synapses_over_unmasked(row):
    # density: synapses per unmasked voxel, scaled by the 64**3-voxel bin volume
    s = (row[4]/row[3])*(64**3)
    return [row[0], row[1], row[2], s]
syn_unmasked = np.apply_along_axis(synapses_over_unmasked, 1, data_thresholded)
syn_normalized = syn_unmasked
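As a quick sanity check on the normalization above: each bin's density is synapses over unmasked voxels, scaled by the 64**3-voxel bin volume. A minimal vectorized sketch, reusing data_thresholded and syn_unmasked from this cell:
In [ ]:
# vectorized equivalent of synapses_over_unmasked, applied to all rows at once
dens_all = data_thresholded[:, 4] / data_thresholded[:, 3] * (64**3)
print(np.allclose(dens_all, syn_unmasked[:, 3]))  # expect True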
In [2]:
import sklearn.mixture as mixture
# Randomly Sample
samples = 10000
perm = np.random.permutation(len(syn_normalized))  # shuffle all row indices (starting at 1 would skip row 0)
syn_normalized_sample = syn_normalized[perm[:samples]]
bics = []
max_clusters = 30
for trial in range(1, 15):  # repeat the whole BIC sweep to smooth out fitting noise
    print ("\ntest: bic loop #%d: " % (trial)),
    bic = np.array([])
    print ("cluster loop: "),
    for idx in range(1, max_clusters):
        print ("%d, " % (idx)),
        gmm = mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag')
        gmm.fit(syn_normalized_sample)
        bic = np.append(bic, gmm.bic(syn_normalized_sample))
    bics.append(bic)
bic = np.asarray(bics)
bic_mean = np.mean(bic, 0)  # average BIC across the repeated trials
cluster_range = np.array(range(1, max_clusters))
plt.figure(figsize=(7,7))
plt.plot(cluster_range, 1.0/bic_mean)  # reciprocal, so higher = better
plt.title('BIC')
plt.ylabel('score (1 / mean BIC)')
plt.xlabel('number of clusters')
plt.show()
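Lower BIC indicates a better fit-complexity trade-off, so the preferred cluster count can also be read off numerically; a minimal sketch, assuming bic_mean and max_clusters from the cell above:
In [ ]:
ks = np.arange(1, max_clusters)
best_k = ks[np.argmin(bic_mean)]  # lowest average BIC wins
print("number of clusters with lowest mean BIC: %d" % best_k)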
In [3]:
# From the BIC curve above, we see an elbow at 27 clusters. (The cause of the sharp spike in the curve is still unclear.)
n_clusters = 27
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag')
clusters = [[] for i in xrange(n_clusters)]
predicted = gmm.fit_predict(syn_normalized_sample)
for label, row in zip(predicted, syn_normalized_sample):  # labels correspond to the sampled rows, not the full data
    clusters[label].append(row)
for i in xrange(n_clusters):
    clusters[i] = np.array(clusters[i])
    print "# of samples in cluster %d: %d" % (i+1, len(clusters[i]))
    print "centroid: ", np.average(clusters[i], axis=0)
    print "cluster covariance: "
    covar = np.cov(clusters[i].T)
    print covar
    print "determinant of covariance matrix: ", np.linalg.det(covar)
    print
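The fitted mixture also exposes its own parameters through sklearn's (since-deprecated) GMM API; these should roughly agree with the empirical centroids printed above:
In [ ]:
# mixture weights and component means straight from the fitted model
print("mixture weights:")
print(gmm.weights_)
print("component means:")
print(gmm.means_)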
In [4]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 7))
ax = fig.gca(projection='3d')
ax.view_init()
ax.dist = 10 # distance
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.set_title('Scatter Plot of GMM with 27 Clusters')
ax.set_xticks(np.arange(min(syn_normalized_sample[:,0]), max(syn_normalized_sample[:,0])+1, 800))
ax.set_yticks(np.arange(min(syn_normalized_sample[:,1]), max(syn_normalized_sample[:,1])+1, 500))
ax.set_zticks(np.arange(min(syn_normalized_sample[:,2]), max(syn_normalized_sample[:,2])+1, 300))
ax.scatter(
    syn_normalized_sample[:, 0], syn_normalized_sample[:, 1], syn_normalized_sample[:, 2],  # data
    c=predicted,  # marker colour
    alpha=.5
)
plt.show()
In [5]:
# Extract the spike
a = np.apply_along_axis(lambda x:x[4]/x[3], 1, data_thresholded)
# Spike
spike = a[np.logical_and(a <= 0.0015, a >= 0.0012)]
print "Points in spike: ", len(spike)
print "Average Density: ", np.mean(spike)
print "Std Deviation: ", np.std(spike)
# Histogram
hist_n, bins, _ = plt.hist(spike, 2000)
plt.title('Histogram of Synaptic Density')
plt.xlabel('Synaptic Density (syn/voxel)')
plt.ylabel('frequency')
# rows of the thresholded data whose density falls in the spike band
data_thresholded[:,4] = a  # overwrite the synapse-count column with density, in place
spike_coords = data_thresholded[np.logical_and(data_thresholded[:,4] <= 0.0015, data_thresholded[:,4] >= 0.0012)]
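For context, it helps to know how much of the thresholded data the spike band accounts for; a minimal sketch using a and spike from this cell:
In [ ]:
frac = float(len(spike)) / len(a)
print("bins in spike band: %d of %d (%.2f%%)" % (len(spike), len(a), 100 * frac))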
In [6]:
from sklearn.decomposition import PCA
import sklearn.mixture as mixture
# center each variable and give it unit variance for PCA
def center(row, means, std_devs):
    row = row.copy()  # apply_along_axis passes views, so copy to avoid standardizing spike_coords in place
    for idx, mean, std_dev in zip(range(4), means, std_devs):
        row[idx] -= mean
        row[idx] *= 1.0/std_dev
    return row
syn_centered = np.apply_along_axis(center, 1, spike_coords,
                                   *zip(*[(np.average(spike_coords[:, i]),
                                           np.std(spike_coords[:, i])) for i in range(4)]))
pca = PCA(n_components = 4)
transform = pca.fit_transform(syn_centered)
print pca.components_
print pca.explained_variance_ratio_
print transform.shape
# plot the clusters along the first 2 principal components
n_clusters = 27
gmm = mixture.GMM(n_components=n_clusters, n_iter=1000, covariance_type='diag')
predicted = gmm.fit_predict(spike_coords)
plt.scatter(transform[:, 0], transform[:, 1], c=predicted, alpha=.3)
plt.title('2D PCA projection, 27 GMM clusters')
plt.xlabel('first principal component')
plt.ylabel('second principal component')
plt.show()
Layer boundaries appear fairly well defined; the layers run along the y-direction and extend across the x-direction.
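To check how much structure the first two components actually capture, the cumulative explained variance is a quick test; a minimal sketch using the pca object fitted above:
In [ ]:
# cumulative fraction of variance explained by the first k principal components
print(np.cumsum(pca.explained_variance_ratio_))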
In [7]:
# Regression (x,y,z,syn/unmasked) on cleaned data ##################################
# Load regressions
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression','SVR','KNN Regression','Random Forest Regression','Polynomial Regression']
regressions = [LinearRegression(),
LinearSVR(C=1.0),
KNN(n_neighbors=10, algorithm='auto'),
RF(max_depth=5, max_features=1),
Pipeline([('poly', PF(degree=2)),('linear', LinearRegression(fit_intercept=False))])]
k_fold = 10
print('Regression on spike')  # X = (x,y,z), Y = syn/unmasked
X = spike_coords[:, (0,1,2)]  # x,y,z
Y = spike[:]  # syn/unmasked from spike
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
print "regression done"
In [8]:
print('Regressions on x and spike')
X = spike_coords[:, [0]]  # x only
Y = spike[:]  # syn/unmasked from spike
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# y
print
print('Regression on y and spike')
X = spike_coords[:, [1]]  # y only
Y = spike[:]  # syn/unmasked from spike
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
# z
print
print('Regression on z and spike')
X = spike_coords[:, [2]]  # z only
Y = spike[:]  # syn/unmasked from spike
for idx2, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=k_fold)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[idx2], scores.mean(), scores.std() * 2))
In [2]:
#re-import data in Bijan's format
np.set_printoptions(precision=3, suppress=True)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:]
def check_condition(row):
    if row[3] == 0:
        return False
    return True
a = np.apply_along_axis(check_condition, 1, csv)
a = np.where(a)[0]  # indices of rows passing the check
nonZeroMask = csv[a, :]
synDividedMask = np.divide(nonZeroMask[:,4],nonZeroMask[:,3])
synDividedMask = synDividedMask * (64**3)
accurateDataT = np.vstack((nonZeroMask[:,0],nonZeroMask[:,1],nonZeroMask[:,2],synDividedMask))
accurateData = accurateDataT.T
cleaned = accurateData[accurateData[:,0] >= 409]
cleaned = cleaned[cleaned[:,0] <= 3529]
cleaned = cleaned[cleaned[:,1] >= 1564]
cleaned = cleaned[cleaned[:,1] <= 3124]
#finished import
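This vectorized pipeline applies the same filters and the same 64**3 scaling as the row-wise version in the first cell, so the two results should coincide; a minimal sketch, assuming syn_unmasked is still in scope:
In [ ]:
# cleaned and syn_unmasked should hold the same rows in the same order
print(cleaned.shape == syn_unmasked.shape)  # expect True
print(np.allclose(cleaned, syn_unmasked))   # expect True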
In [10]:
divisions = np.unique(cleaned[:,2])
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(20,5))
for d in divisions:
    z_layer = cleaned[cleaned[:,2] == d]
    z_ind = (d + 56) / 111  # map z-coordinate to a layer index
    if (z_ind == 2 or z_ind == 4):
        weights = np.ones_like(z_layer[:,3])/len(z_layer[:,3])  # normalize to relative frequency
        weights2 = np.ones_like(cleaned[:,3])/len(cleaned[:,3])
        ax[int(z_ind/2) - 1].hist(z_layer[:,3], bins=100, alpha=0.5, weights=weights, label='z')
        ax[int(z_ind/2) - 1].hist(cleaned[:,3], bins=100, alpha=0.5, weights=weights2, label='all data')
        ax[int(z_ind/2) - 1].legend(loc='upper right')
        title = 'z layer, ' + str(int(d))
        ax[int(z_ind/2) - 1].set_title(title)
        #ax[int(z_ind/2) - 1].set_yticks(np.arange(0,150,30))
        #ax[int(z_ind/2) - 1].set_xticks(np.arange(0,800,200))
plt.show()
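Beyond eyeballing the overlaid histograms, a two-sample Kolmogorov-Smirnov test gives a crude numeric comparison of each layer's density distribution against the whole volume; a minimal sketch using scipy.stats (imported in the first cell):
In [ ]:
for d in divisions:
    layer = cleaned[cleaned[:, 2] == d]
    D, p = stats.ks_2samp(layer[:, 3], cleaned[:, 3])
    print("z = %d: KS statistic %.4f, p-value %.3g" % (int(d), D, p))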
In [ ]: