Assume that synaptic density (synapses/unmasked), $Y$, follows some conditional distribution $F_{Y \mid X}$, where $X$ is the set of data points; each $x_i \in X$ is a vector in $\mathbb{R}^3$ whose elements correspond, respectively, to the x, y, z coordinates given by the data.
Let the true density values form the set $Y$, and let the distribution be parameterized by $\theta$, so that for each $x_i \in X$ and $y_i \in Y$, $F(x_i;\theta)=y_i$.
We want to find parameters $\hat \theta$ that minimize a loss function $l(\hat y, y)$, where $\hat y = F(x;\hat \theta)$.
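As a quick illustration of this setup (a minimal sketch, assuming a squared-error loss $l(\hat y, y) = (\hat y - y)^2$, a linear form for $F$, and made-up data, none of which are fixed by the analysis below):
In [ ]:
# Minimal illustrative sketch: theta_hat minimizing summed squared-error loss
# for a linear F(x; theta) = x . theta, via ordinary least squares on toy data.
import numpy as np
rng = np.random.RandomState(0)
X_toy = rng.uniform(size=(100, 3))                 # 100 toy points in R^3
theta_true = np.array([1.0, -2.0, 0.5])            # illustrative parameters
y_toy = X_toy.dot(theta_true) + rng.normal(scale=0.1, size=100)
theta_hat, _, _, _ = np.linalg.lstsq(X_toy, y_toy)  # argmin of squared-error loss
y_hat = X_toy.dot(theta_hat)
print theta_hat, np.mean((y_hat - y_toy) ** 2)      # estimated theta and mean loss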
In [1]:
from __future__ import division
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import urllib2
np.random.seed(1)
url = ('https://raw.githubusercontent.com/Upward-Spiral-Science'
'/data/master/syn-density/output.csv')
data = urllib2.urlopen(url)
csv = np.genfromtxt(data, delimiter=",")[1:] # don't want first row (labels)
# chopping data based on thresholds on x and y coordinates
x_bounds = (409, 3529)
y_bounds = (1564, 3124)
def check_in_bounds(row, x_bounds, y_bounds):
    if row[0] < x_bounds[0] or row[0] > x_bounds[1]:
        return False
    if row[1] < y_bounds[0] or row[1] > y_bounds[1]:
        return False
    if row[3] == 0:  # drop rows with unmasked == 0 to avoid division by zero
        return False
    return True

indices_in_bound, = np.where(np.apply_along_axis(check_in_bounds, 1, csv,
                                                 x_bounds, y_bounds))
data_thresholded = csv[indices_in_bound]
n = data_thresholded.shape[0]
data = data_thresholded
# density = synapses/unmasked; store it in place of unmasked, then drop the raw synapse column
data[:, -2] = data[:, -1]/data[:, -2]
data = data[:, :-1]
print data[:, -1]
# standardize density (zero mean, unit variance) so the values aren't tiny
data[:, -1] -= np.average(data[:, -1])
data[:, -1] /= np.std(data[:, -1])
print data[:, -1]
In [2]:
mins = [np.min(csv[:,i]) for i in xrange(4)]
maxs = [np.max(csv[:,i]) for i in xrange(4)]
domains = zip(mins, maxs)
# sample sizes
S = np.logspace(2.0, 4.0, num=20, base=10.0, dtype='int')
# under the null model, Y is sampled independently of X
null_X = np.array([[np.random.randint(*domains[i]) for i in xrange(3)]
                   for k in xrange(S[-1])])
null_Y = np.random.uniform(*domains[-1], size=S[-1])
print null_X.shape, null_Y.shape
In [13]:
# load our regressions
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.pipeline import Pipeline
from sklearn import cross_validation
names = ['Linear Regression', 'SVR', 'KNN Regression',
         'Random Forest Regression', 'Polynomial Regression']
regressions = [LinearRegression(),
               LinearSVR(C=1.0),
               KNN(n_neighbors=10, algorithm='auto'),
               RF(max_depth=5, max_features=1),
               Pipeline([('poly', PF(degree=2)),
                         ('linear', LinearRegression(fit_intercept=False))])]
In [3]:
r2 = np.zeros((len(S), len(regressions), 2), dtype=np.dtype('float64'))
#iterate over sample sizes and regression algos
for i, N in enumerate(S):
    # randomly sample from the synthetic data with sample size N
    a = np.random.permutation(np.arange(S[-1]))[:N]
    X = null_X[a]
    Y = null_Y[a]
    Y = np.ravel(Y)
    print "Sample size = ", N
    for k, reg in enumerate(regressions):
        scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
        r2[i, k, :] = [scores.mean(), scores.std()]
        print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[k], scores.mean(), scores.std() * 2))
Now graphing this data:
In [4]:
plt.errorbar(S, r2[:,0,0], yerr = r2[:,0,1], hold=True, label=names[0])
plt.errorbar(S, r2[:,1,0], yerr = r2[:,1,1], color='green', hold=True, label=names[1])
plt.errorbar(S, r2[:,2,0], yerr = r2[:,2,1], color='red', hold=True, label=names[2])
plt.errorbar(S, r2[:,3,0], yerr = r2[:,3,1], color='black', hold=True, label=names[3])
plt.errorbar(S, r2[:,4,0], yerr = r2[:,4,1], color='brown', hold=True, label=names[4])
plt.xscale('log')
plt.axhline(1, color='red', linestyle='--')
plt.xlabel('Sample size')
plt.ylabel('R^2 Score')
plt.title('Regression results on simulated data under the null')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
Here we assume a conditional dependence. Keep x, y, z uniformly distributed across the sample space, but let density $y_i$ be the sum of a deterministic function $f:\mathbb{R}^3 \rightarrow \mathbb{R}$ evaluated at $x_i$ and Gaussian noise $\epsilon_i$ with mean 0 and a small standard deviation. Let $f(x,y,z)=(x+y+z)/3$, standardized to zero mean and unit variance over the sample, and let the standard deviation of $\epsilon$ be .01, so that $y_i = f(x_i) + \epsilon_i$ with $\epsilon_i \sim \mathcal{N}(0, .01^2)$.
In [5]:
# X under the alt same as under the null
alt_X = null_X
f_X = np.apply_along_axis(lambda r: reduce(lambda x,y:x+y, r)/3, 1, alt_X)
f_X -= np.average(f_X)
f_X /= np.std(f_X)
alt_Y = np.random.normal(0, .01, size=f_X.shape)+f_X
print alt_Y.shape
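Since f_X is standardized to unit variance and the noise standard deviation is only .01, the best achievable $R^2$ under this model should be roughly $1 - \frac{.01^2}{1 + .01^2} \approx 0.9999$. A quick sanity check on the simulated data (assuming the construction above):
In [ ]:
# Illustrative check: oracle R^2 = 1 - Var(eps)/Var(Y), using the true f_X as the prediction;
# with unit-variance signal and noise std .01 this is essentially 1.
print 1 - np.var(alt_Y - f_X)/np.var(alt_Y)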
In [6]:
r2 = np.zeros((len(S), len(regressions), 2), dtype=np.dtype('float64'))
#iterate over sample sizes and regression algos
for i, N in enumerate(S):
    # randomly sample from the synthetic data with sample size N
    a = np.random.permutation(np.arange(S[-1]))[:N]
    X = alt_X[a]
    Y = alt_Y[a]
    Y = np.ravel(Y)
    print "Sample size = ", N
    for k, reg in enumerate(regressions):
        scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
        r2[i, k, :] = [scores.mean(), scores.std()]
        print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[k], scores.mean(), scores.std() * 2))
Now graphing it:
In [7]:
plt.errorbar(S, r2[:,0,0], yerr = r2[:,0,1], hold=True, label=names[0])
plt.errorbar(S, r2[:,1,0], yerr = r2[:,1,1], color='green', hold=True, label=names[1])
plt.errorbar(S, r2[:,2,0], yerr = r2[:,2,1], color='red', hold=True, label=names[2])
plt.errorbar(S, r2[:,3,0], yerr = r2[:,3,1], color='black', hold=True, label=names[3])
plt.errorbar(S, r2[:,4,0], yerr = r2[:,4,1], color='brown', hold=True, label=names[4])
plt.xscale('log')
plt.axhline(1, color='red', linestyle='--')
plt.xlabel('Sample size')
plt.ylabel('R^2 Score')
plt.title('Regression results on simulated data under the alternate')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
In [8]:
X = data[:, (0, 1, 2)]
Y = data[:, -1]
for i, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[i], scores.mean(), scores.std() * 2))
In [9]:
n_neighbors = np.arange(1, 50)
r2 = []
for n in n_neighbors:
    reg = KNN(n_neighbors=n, algorithm='auto')
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
    r2.append(np.array([scores.mean(), scores.std()]))
r2 = np.array(r2)
plt.errorbar(n_neighbors, r2[:,0], yerr = r2[:,1])
plt.title("Number of neighbors against R^2 for KNN Regression")
plt.xlabel("number of neighbors")
plt.ylabel("R^2")
plt.show()
print "mean r^2 maximized at: ", np.argmax(r2[:,0])+1
print "variance minimized at: ", np.argmin(r2[:,1])+1
In [10]:
depth = np.arange(1, 20)
r2 = []
for d in depth:
    reg = RF(max_depth=d, max_features=1)
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
    r2.append(np.array([scores.mean(), scores.std()]))
r2 = np.array(r2)
plt.errorbar(depth, r2[:,0], yerr = r2[:,1])
plt.title("Max depth against R^2 for RandomForestRegression")
plt.xlabel("Max depth")
plt.ylabel("R^2")
plt.show()
print "mean r^2 maximized at: ", np.argmax(r2[:,0])+1
print "variance minimized at: ", np.argmin(r2[:,1])+1
In [11]:
features = np.arange(1, 4)
r2 = []
for f in features:
    reg = RF(max_depth=10, max_features=f)
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
    r2.append(np.array([scores.mean(), scores.std()]))
    print("R^2 of RF with max_features=%d: %0.2f (+/- %0.2f)" % (f, scores.mean(), scores.std() * 2))
r2 = np.array(r2)
In [12]:
# boost number of neighbors for KNN and max depth for random forest
regressions = [KNN(n_neighbors=29, algorithm='auto'),
               RF(max_depth=10, max_features=1)]
names = ['KNN Regression', 'Random Forest Regression']
for i, reg in enumerate(regressions):
    scores = cross_validation.cross_val_score(reg, X, Y, scoring='r2', cv=10)
    print("R^2 of %s: %0.2f (+/- %0.2f)" % (names[i], scores.mean(), scores.std() * 2))