In [1]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'spectral'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [2]:
np.random.seed(0)
N = 500 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in xrange(K):
    ix = range(N*j,N*(j+1))
    r = np.linspace(0.05,1,N) # radius
    t = np.linspace(j*(K+1),(j+1)*(K+1),N) + np.random.randn(N)*0.2 # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40,linewidths=0)#, cmap=plt.cm.Spectral)
plt.xlim([-r[-1]*1.1,r[-1]*1.1])
plt.ylim([-r[-1]*1.1,r[-1]*1.1])
#fig.savefig('spiral_raw.png')
Out[2]:
In [939]:
#Train a Linear Classifier
# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# gradient descent loop
num_examples = X.shape[0]
for i in xrange(200):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(correct_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print "iteration %d: loss %f" % (i, loss)
    # compute the gradient on scores
    dscores = probs
    dscores[range(num_examples),y] -= 1
    dscores /= num_examples
    # backpropagate the gradient to the parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
    dW += reg*W # regularization gradient
    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
In [940]:
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
In [3]:
sigmoid = lambda x : 1.0 / (1.0 + np.exp(-x))
tanh = lambda x: np.tanh(x)
In [4]:
import numpy.random as npr
In [1097]:
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_linear.png')
Out[1097]:
In [1098]:
random_starts = []
for i in range(1000):
    h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    random_starts.append(mats_to_vec(W,b,W2,b2)) # mats_to_vec is defined further down in this notebook
random_starts = np.array(random_starts)
In [1099]:
from scipy.spatial import distance
pdist = distance.pdist(random_starts)
dist_mat = distance.squareform(pdist)
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between random parameter initializations')
plt.savefig('l2-pairwise-param-vec-distances-30h-random.jpg',dpi=600)
In [ ]:
In [5]:
def train_net(h=30,D=D,K=K,num_iter=10000):
    # initialize parameters randomly
    #h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    # some hyperparameters
    step_size = 1e-0
    reg = 1e-3 # regularization strength
    # keep a record of training progress
    params = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                        np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    num_params = len(params)
    run_record = np.zeros((num_iter,num_params))
    loss_record = np.zeros(num_iter)
    # stochastic gradient descent loop
    #mini_batch_size = len(X)
    mini_batch_size = 100
    for i in xrange(num_iter):
        # sample a minibatch
        if mini_batch_size < len(X):
            mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
            X_ = X[mask]
            y_ = y[mask]
        else:
            X_ = X[:]
            y_ = y[:]
        num_examples = len(X)#X_.shape[0]
        # evaluate class scores, [N x K]
        #hidden_layer = tanh(np.dot(X, W) + b)
        #hidden_layer = sigmoid(np.dot(X, W) + b)
        hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
        # optionally compute the full loss: average cross-entropy loss plus regularization
        compute_full_loss = False
        if mini_batch_size == len(X) or compute_full_loss:
            correct_logprobs = -np.log(probs[range(len(X)),y])
            data_loss = np.sum(correct_logprobs)/num_examples
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            loss = data_loss + reg_loss
            if i % 1000 == 0:
                print "iteration %d: loss %f" % (i, loss)
            loss_record[i] = loss
        # compute the gradient on scores (overwritten below when minibatching)
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
        # recompute the forward pass and score gradient on the minibatch only
        if mini_batch_size < len(X):
            num_examples = X_.shape[0]
            hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
            scores = np.dot(hidden_layer, W2) + b2
            # compute the class probabilities
            exp_scores = np.exp(scores)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
            correct_logprobs = -np.log(probs[range(num_examples),y_])
            data_loss = np.sum(correct_logprobs)/num_examples
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            loss = data_loss + reg_loss
            # compute the gradient on scores
            dscores = probs
            dscores[range(num_examples),y_] -= 1
            dscores /= num_examples
        # backpropagate the gradient to the parameters
        # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
        # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] = 0
        # finally into W,b
        dW = np.dot(X_.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
        # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
        # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        # record the flattened parameter vector after this update
        run_record[i] = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                                   np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    return run_record
In [1105]:
run_record1 = train_net()
In [6]:
run_records = []
In [7]:
# how far away are the endpoints of multiple runs?
from time import time
t = time()
for i in range(10):
    run_records.append(train_net())
    print(i)
print(time() - t)
In [8]:
# now let's do ICA and tICA
from sklearn.decomposition import FastICA
In [96]:
ica = FastICA(2)
In [97]:
all_runs = np.vstack([run for run in run_records])
In [98]:
ica.fit(all_runs[::10])
Out[98]:
In [99]:
c = np.hstack([np.ones(len(run))*i for i,run in enumerate(run_records)])
In [100]:
c.shape
Out[100]:
In [101]:
X_ica = ica.transform(all_runs)[::50]
In [102]:
plt.scatter(X_ica[:,0],X_ica[:,1],c=c[::50],linewidths=0)
Out[102]:
In [103]:
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(all_runs)[::50]
In [104]:
plt.scatter(X_pca[:,0],X_pca[:,1],c=c[::50],linewidths=0)
Out[104]:
In [135]:
from msmbuilder.decomposition import tICA
tica = tICA(n_components=2,lag_time=100)
In [136]:
tica.fit([run for run in run_records])
Out[136]:
In [131]:
tica.fit([run_records[0]])
Out[131]:
In [132]:
tica.components_.shape
Out[132]:
In [137]:
X_tica = tica.transform([run_records[0]])[0]
In [140]:
X_tica = tica.transform(run_records)
In [142]:
c.shape
Out[142]:
In [145]:
for i,x in enumerate(X_tica):
    plt.scatter(x[::50,0],x[::50,1],linewidths=0,c=(np.ones(len(x))*i)[::50])
#plt.scatter(X_tica[:,0],X_tica[:,1],linewidths=0)
In [ ]:
In [105]:
start_points = np.array([run_record[0] for run_record in run_records])
pdist = distance.pdist(start_points)
max(pdist),min(pdist)
In [1155]:
def pairwise_distance_over_time(run_records):
    n = len(run_records[0])
    current_points = np.array([run_record[0] for run_record in run_records])
    pdist = distance.pdist(current_points)
    num_dist = len(pdist)
    distances = np.zeros((n,num_dist))
    for i in range(n):
        current_points = np.array([run_record[i] for run_record in run_records])
        pdist = distance.pdist(current_points)
        distances[i] = pdist
    return distances
def plot_pairwise(run_records):
    distances = pairwise_distance_over_time(run_records)
    plt.plot(np.mean(distances,1))
    for i in range(distances.shape[1]):
        plt.scatter(range(distances.shape[0]),distances[:,i],
                    linewidths=0,alpha=0.5)
In [1156]:
distances = pairwise_distance_over_time(run_records)
In [1160]:
plt.plot(np.mean(distances,1))
Out[1160]:
In [1164]:
distances.shape
Out[1164]:
In [1165]:
#lin_dist = distances.reshape(np.prod(distances.shape))
In [1170]:
lin_dist=np.zeros(np.prod(distances.shape))
for i in range(len(distances.T)):
    lin_dist[i*distances.shape[0]:(i+1)*distances.shape[0]] = distances[:,i]
In [1166]:
iter_num = np.hstack([range(len(distances)) for _ in range(distances.shape[1])])
In [1173]:
plt.scatter(iter_num[::100],lin_dist[::100],
linewidth=0)
Out[1173]:
In [1201]:
plt.scatter(iter_num[iter_num<2000][::15],lin_dist[iter_num<2000][::15],
linewidth=0,alpha=0.005);
In [1188]:
dist_vecs = [distance.pdist(run[::10]) for run in run_records]
In [1191]:
np.array(dist_vecs).shape
Out[1191]:
In [1192]:
avg_dist_vec = np.mean(np.array(dist_vecs),0)
In [1193]:
avg_pdist = distance.squareform(avg_dist_vec)
In [1238]:
for i,dist_vec in enumerate(dist_vecs):
    plt.imshow(distance.squareform(dist_vec),
               interpolation='none',
               cmap='Blues')
    plt.savefig('distmat {0}.jpg'.format(i),dpi=600)
Which runs are most typical, and which are least typical?
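One quick way to check (a sketch using the dist_vecs and avg_dist_vec computed above, not a cell from the original run): rank the runs by how far each run's pairwise-distance profile sits from the average profile.
# Hypothetical follow-up: rank runs by how close their distance profile is to the average.
deviations = np.array([np.linalg.norm(dv - avg_dist_vec) for dv in dist_vecs])
print('most typical run: {0}, least typical run: {1}'.format(
    np.argmin(deviations), np.argmax(deviations)))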
In [ ]:
In [1194]:
plt.imshow(avg_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1194]:
In [1195]:
std_dist_vec = np.std(np.array(dist_vecs),0)
In [1196]:
std_pdist = distance.squareform(std_dist_vec)
In [1197]:
plt.imshow(std_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1197]:
In [1207]:
plt.hist2d(iter_num[iter_num<2000],lin_dist[iter_num<2000],bins=50,
cmap='Blues');
plt.colorbar()
Out[1207]:
In [ ]:
for i in range(distances[:,:100].shape[1]):
    plt.scatter(range(distances.shape[0]),distances[:,i],
                linewidths=0,alpha=0.5)
In [1151]:
end_points = np.array([run_record[-1] for run_record in run_records])
In [1152]:
end_points.shape
Out[1152]:
In [1153]:
pdist = distance.pdist(end_points)
max(pdist),min(pdist)
Out[1153]:
In [1217]:
run_records_ = np.vstack(run_records)
run_records_.shape
Out[1217]:
In [1147]:
run_records_[::50].shape
Out[1147]:
In [1221]:
all_dist = distance.pdist(run_records_[npr.rand(len(run_records_))<0.001])
In [1222]:
plt.hist(all_dist,bins=100);
In [1225]:
pca = PCA()
pca.fit(run_records_[::10])
Out[1225]:
In [1226]:
X_ = pca.transform(run_records_)
In [1227]:
pca.explained_variance_ratio_[:2]
Out[1227]:
In [1228]:
c = np.hstack([i*np.ones(len(run_records[0])) for i in range(len(run_records))])
c.shape
Out[1228]:
In [1229]:
c[::thinning].shape
Out[1229]:
In [1237]:
thinning=50
plt.scatter(X_[::thinning,0],X_[::thinning,1],
c=c[::thinning],
alpha=0.5,
linewidths=0)
plt.xlabel('PC1')
plt.ylabel('PC2')
Out[1237]:
In [1236]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_[::thinning,0],X_[::thinning,1],X_[::thinning,2],
alpha=0.5,c=c[::thinning],linewidths=0)
Out[1236]:
In [962]:
run_record.shape
Out[962]:
In [946]:
D,h,K
Out[946]:
In [947]:
l1 = lambda x,y : sum(abs(x-y))
In [948]:
l2 = lambda x,y : np.sqrt(sum((x-y)**2))
In [949]:
norm = l2
In [963]:
distance_from_start = np.array([norm(run_record[i],run_record[0]) for i in xrange(len(run_record))])
In [964]:
plt.rcParams['font.family'] = 'serif'
In [965]:
plt.plot(distance_from_start)
plt.hlines(max(distance_from_start),0,len(distance_from_start),linestyle='--')
plt.title('Distance from initial parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[965]:
In [999]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(run_record[i],run_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[999]:
In [760]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[760]:
In [761]:
plt.plot(loss_record)
plt.title('Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.ylim(0,max(loss_record))
plt.vlines(best_index,0,max(loss_record),linestyle='--')
Out[761]:
In [762]:
worst_index = np.argmax(loss_record)
distance_from_worst = np.array([norm(run_record[i],run_record[worst_index]) for i in xrange(len(run_record))])
worst_index
Out[762]:
In [763]:
plt.plot(distance_from_worst)
plt.vlines(worst_index,0,max(distance_from_worst),linestyle='--')
plt.title('Distance from worst parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[763]:
In [1001]:
# evaluate training set accuracy
hidden_layer = np.maximum(0, np.dot(X, W) + b)
scores = np.dot(hidden_layer, W2) + b2
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
In [1240]:
def accuracy(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    return np.mean(predicted_class == y)
In [1332]:
def loss(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    num_examples = len(X)
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
    correct_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(correct_logprobs)/num_examples
    reg_loss = 0
    #reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    return data_loss + reg_loss
In [1255]:
D
Out[1255]:
In [1256]:
run_records[0][0].shape
Out[1256]:
In [1258]:
%timeit loss(run_records[0][0])
In [1260]:
interp = lambda initial,final,alpha=0.5: (1-alpha)*initial + alpha*final
In [1372]:
dim = len(end_points[0])
In [1373]:
alpha= np.arange(-0.2,1.5,0.05)
inds = range(len(start_points))
In [1374]:
line_losses = [[loss(interp(start_points[i],end_points[i],a)) for a in alpha] for i in inds]
In [1375]:
line_losses = [[loss(interp(np.zeros(dim),end_points[i],a)) for a in alpha] for i in inds]
In [1604]:
line_losses = [[loss(interp(np.zeros(dim),run_records[i][-1],a)) for a in alpha] for i in inds]
In [1605]:
line_losses = np.array(line_losses)
line_losses.shape
Out[1605]:
In [1606]:
np.min(line_losses)
Out[1606]:
In [1607]:
plt.plot(line_losses[0]);
In [1608]:
best_solution = min([loss(e) for e in end_points])
best_solution
Out[1608]:
In [1615]:
[plt.plot(alpha,line_losses[i],alpha=0.3) for i in range(len(line_losses))];
plt.hlines(best_solution,alpha[0],alpha[-1],linestyle='--',color='grey')
plt.vlines(1,0,np.max(line_losses),linestyle='--',color='grey')
plt.xlabel('alpha')
plt.ylabel('loss')
Out[1615]:
In [1370]:
np.mean(abs(end_points))
Out[1370]:
In [1371]:
np.mean(abs(npr.randn(1000)))
Out[1371]:
So what does this look like if you pick a random direction? Or take one good network and scramble its weight vector? Or interpolate between two good networks?
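The random-direction and scrambling experiments follow below; interpolation between two good networks isn't run here, but a minimal sketch using the loss and interp helpers defined above (alphas_pair is just a name for the grid, assuming end_points holds at least two runs) would be:
# Sketch (not executed in the original notebook): loss along the straight line
# between two independently trained solutions.
alphas_pair = np.arange(-0.2, 1.2, 0.05)
pair_losses = [loss(interp(end_points[0], end_points[1], a)) for a in alphas_pair]
plt.plot(alphas_pair, pair_losses)
plt.xlabel('alpha')
plt.ylabel('loss')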
In [1895]:
rand_end_points = npr.randn(10000,dim)
In [1918]:
line_losses = [[loss(interp(np.zeros(dim),rand_end_points[i],alpha)) for alpha in np.arange(-1,1,0.1)] for i in range(10000)]
In [1904]:
[plt.plot(line) for line in line_losses[:10]];
In [1921]:
np.min(line_losses),np.max(line_losses)
Out[1921]:
In [1902]:
np.array(line_losses).shape
Out[1902]:
In [1922]:
rand_min_losses = np.array([np.min(l) for l in line_losses])
In [1924]:
plt.hist(rand_min_losses,bins=50,normed=True,log=True);
plt.vlines(best_solution,0,130.0,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(np.min(rand_min_losses),0,130.0,linestyle='--',color='blue',label='Best random direction')
plt.legend(loc='best')
plt.xlabel('Loss')
plt.ylabel('Probability density')
plt.title('Line search in random directions doesn\'t work')
Out[1924]:
In [1416]:
scrambled_end_points = [end_points[0][sorted(range(dim),key=lambda i:npr.rand())] for _ in range(100)]
In [1418]:
np.min(line_losses)
Out[1418]:
In [1423]:
end_points.shape
Out[1423]:
In [1428]:
best_ind = np.argmin(np.array([loss(e) for e in end_points]))
best_ind
Out[1428]:
In [1429]:
loss(end_points[best_ind])
Out[1429]:
In [1437]:
line_losses = np.array([[loss(interp(np.zeros(dim),scrambled_end_points[i],a)) for a in alpha] for i in inds])
In [1454]:
plt.bar(range(len(end_points.T)),sorted(np.mean(end_points,0)));
In [1482]:
stats.ttest_1samp(end_points[:,i],0)[0]
Out[1482]:
In [1486]:
from scipy import stats
sig = np.zeros(end_points.shape[1])
for i in range(end_points.shape[1]):
    #mean,(low,up) = stats.bayes_mvs(end_points[:,i],alpha=0.95)[0]
    ##sig[i] = (low*up>0)*abs(mean / (up-low))
    sig[i] = abs(stats.ttest_1samp(end_points[:,i],0)[0])
In [1487]:
plt.plot(sig)
Out[1487]:
In [1498]:
mats=vec_to_mats(sig,D,h,K)
In [1504]:
fig, axes = plt.subplots(nrows=2, ncols=2)
for i,ax in enumerate(axes.flat):
    im = ax.imshow(mats[i],cmap='Blues')
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im,cax=cbar_ax)#, cax=cbar_ax)
Out[1504]:
Hmm, so it looks like the biases learn consistent values across multiple runs.
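A quick way to quantify that (a sketch, assuming end_points, vec_to_mats, and the globals D, h, K as defined elsewhere in this notebook): compare the across-run spread of the bias entries with that of the weight entries.
# Unpack each run's final parameter vector and compare per-parameter std across runs.
end_mats = [vec_to_mats(e, D, h, K) for e in end_points]
b_std = np.std([m[1] for m in end_mats], axis=0).mean()   # hidden-layer biases
b2_std = np.std([m[3] for m in end_mats], axis=0).mean()  # output biases
W_std = np.std([m[0] for m in end_mats], axis=0).mean()   # input-to-hidden weights
W2_std = np.std([m[2] for m in end_mats], axis=0).mean()  # hidden-to-output weights
print('mean across-run std: b={0:.3f}, b2={1:.3f}, W={2:.3f}, W2={3:.3f}'.format(b_std, b2_std, W_std, W2_std))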
In [1458]:
end_points[:,0].shape
Out[1458]:
In [1444]:
plt.hist([l1(np.zeros(dim),e) for e in end_points],bins=50);
In [1506]:
plt.hist([l2(np.zeros(dim),e) for e in end_points],bins=100);
In [1616]:
aligned_end_points = heuristic_alignment(end_points,D,h,K)
In [1618]:
raw_distances = distance.pdist(end_points)
aligned_distances = distance.pdist(aligned_end_points)
In [1619]:
raw_distances.mean(),aligned_distances.mean()
Out[1619]:
In [1620]:
raw_distances.min(),aligned_distances.min()
Out[1620]:
In [1510]:
plt.hist([l2(np.zeros(dim),18*npr.randn(dim)/14) for _ in range(1000)],bins=100);
In [1438]:
plt.plot(line_losses[10])
Out[1438]:
Compared with the direction from start to finish, what is the angle from the start to the current iterate?
In [1511]:
start_to_f = [e - i for (e,i) in zip(end_points,start_points)]
In [1522]:
e = end_points[0]
e.dot(-e)/(np.sqrt(sum(e**2))*np.sqrt(sum(e**2)))
Out[1522]:
In [1576]:
angle = lambda x,y:x.dot(y)/(np.sqrt(sum(x**2))*np.sqrt(sum(y**2)))
In [1534]:
r = run_records[0]
In [1539]:
plt.plot(angle(start_to_f[0],r.T))
Out[1539]:
In [1784]:
[plt.plot(angle(start_to_f[i],run_records[i].T),alpha=0.5) for i in range(len(run_records))];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity w.r.t. final')
Out[1784]:
In [1554]:
angle(end_points[40],end_points[41])
Out[1554]:
In [1563]:
run_records[0].shape
Out[1563]:
In [1572]:
run_records[0][:,:-lag].dot(run_records[0][:,:-lag].T).shape
Out[1572]:
In [1586]:
def lag_angle(record,lag=10):
    n = len(record) - lag
    angles = np.zeros(n)
    for i in range(n):
        angles[i] = angle(record[i],record[i+lag])
    return angles
In [1587]:
run_records[:10][0].shape
Out[1587]:
In [1783]:
lag=200
[plt.plot(lag_angle(r[:1200],lag),alpha=0.5) for r in run_records];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity at lag {0}'.format(lag))
Out[1783]:
In [ ]:
In [1601]:
cosine_distances = distance.pdist(end_points,'cosine')-1
distance.squareform(cosine_distances).shape
Out[1601]:
In [1602]:
np.mean(cosine_distances),np.min(cosine_distances),np.max(cosine_distances)
Out[1602]:
In [ ]:
In [1600]:
distance.cosine(np.ones(200),np.ones(200))
Out[1600]:
In [1559]:
plt.imshow(distance.squareform(distance.pdist(r[::10],'cosine')),cmap='Blues')
Out[1559]:
How sensitive is this result to the direction? i.e. how large is $\left|\frac{\partial \min_\alpha J}{\partial \theta}\right|$?
Now it's convenient to define a loss function in terms of just the direction, since the minimum along that direction can be found easily using a coarse line search.
Line loss
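One way to write this down: the line loss of a direction vector $\theta$ is
$$\hat{J}(\theta) = \min_{\alpha} J(\alpha\,\theta),$$
which min_line_loss below approximates by evaluating $J(\alpha\theta)$ over a coarse grid of $\alpha$ values.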
In [1738]:
min_line_loss = lambda vec,loss=loss,alpha=np.arange(0.0,1.5,0.075):np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1648]:
loss_as_f_alpha = lambda vec,loss=loss:lambda alpha:loss(alpha*vec)
In [ ]:
In [1656]:
from scipy import optimize
direction = end_points[best_ind]
loss_in_best_dir = loss_as_f_alpha(direction)
optimize.fmin(loss_as_f_alpha(direction),1.0)
Out[1656]:
In [1674]:
t = time()
optimize.fmin(loss_in_best_dir,1.0,maxfun=30,disp=0)
print(time() - t)
In [1645]:
%timeit min_line_loss(npr.randn(dim))
In [1646]:
min_line_loss(end_points[best_ind])
Out[1646]:
In [1631]:
def gradient(func,x0,h=0.001):
    ''' one-sided finite-difference approximation to the gradient of func at x0 '''
    x0 = np.array(x0)#,dtype=float)
    y = func(x0)
    deriv = np.zeros(len(x0))
    for i in range(len(x0)):
        x = np.array(x0)
        x[i] += h
        deriv[i] = (func(x) - y)/h
    return deriv
In [1635]:
%timeit gradient(min_line_loss,end_points[best_ind])
In [1638]:
new_dir = end_points[best_ind] - gradient(min_line_loss,end_points[best_ind])
In [1675]:
min_line_loss_scipy = lambda vec,loss=loss:optimize.fmin(loss_as_f_alpha(vec),1.0,maxfun=30,disp=0)
In [1677]:
gradient(min_line_loss_scipy,end_points[best_ind])[:10]
Out[1677]:
In [1678]:
min_line_loss(new_dir)
Out[1678]:
In [1687]:
npr.seed(0)
new_dir = npr.randn(dim)
directions = [new_dir.copy()]
losses = [min_line_loss(new_dir)]
for i in range(10):
    new_dir -= 100*gradient(min_line_loss,new_dir)
    directions.append(new_dir.copy()) # copy, so the list doesn't hold aliases of the same array
    losses.append(min_line_loss(new_dir))
    print(losses[-1])
In [1683]:
sum(abs(new_dir))
Out[1683]:
In [1840]:
def adaptive_metropolis_hastings(func,x0,num_steps=1000,step_size=0.1,verbose=False):
    locs = np.zeros((num_steps+1,len(x0)))
    vals = np.zeros(num_steps+1)
    locs[0] = x0
    vals[0] = func(x0)
    num_rejects = 0
    num_accepts = 0
    message = ''
    adaptation_rate = 1.02
    steps = np.zeros(num_steps+1)
    steps[0] = step_size
    for i in range(num_steps):
        new_loc = locs[i] + npr.randn(len(locs[i]))*step_size
        new_val = func(new_loc)
        #accept = npr.rand() < np.exp(-(new_val - vals[i]))
        accept = new_val <= vals[i]  # greedy acceptance rule (hill-climbing)
        if not accept:
            new_loc = locs[i]
            new_val = vals[i]
            num_rejects += 1
            num_accepts = 0
            if num_rejects > 10:
                if step_size > 1e-3:
                    step_size /= adaptation_rate
                    message = message + '\n reduced step size'
        else:
            num_rejects = 0
            num_accepts += 1
            if num_accepts > 10:
                if step_size < 1.0:
                    step_size *= adaptation_rate
                    message = message + '\n increased step size'
        locs[i+1] = new_loc
        vals[i+1] = new_val
        if i % 100 == 0:
            print("{0}: {1:.3}".format(i,new_val))
            if verbose:
                print(message)
                message = ''
        steps[i+1] = step_size
    return locs,vals,steps
In [1841]:
np.arange(0.0,1.5,0.1).shape
Out[1841]:
In [1842]:
def min_line_loss(vec,loss=loss,alpha=np.arange(0.0,1.5,0.1)):
    return np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1843]:
locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind],num_steps=2000,step_size=0.01)
#+npr.randn(dim)*0.01
In [1865]:
plt.plot(vals)
plt.hlines(vals[0],0,len(vals),linestyle='--',color='grey')
plt.ylabel('loss')
plt.xlabel('Hill-climbing step')
Out[1865]:
In [1845]:
plt.plot(steps)
Out[1845]:
In [1810]:
np.mean(abs(end_points[best_ind]))
Out[1810]:
In [1811]:
np.mean(abs(npr.randn(dim)))
Out[1811]:
In [1849]:
random_directions=npr.randn(100,dim)
In [1850]:
best_rand = np.argmin([min_line_loss(r) for r in random_directions])
In [1886]:
multiple_hc_runs_near = []
for i in range(10):
    locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+npr.randn(dim)*0.5,num_steps=1000,step_size=0.01)
    multiple_hc_runs_near.append((locs,vals,steps))
#locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+0.5*npr.randn(dim),num_steps=5000,step_size=0.01)
In [1879]:
# random initialization
for soln in multiple_hc_runs[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Random initialization')
Out[1879]:
In [1887]:
for soln in multiple_hc_runs_near[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs_near[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Initialized near SGD solution')
Out[1887]:
In [1890]:
hc_solns = np.array([run[0][-1] for run in multiple_hc_runs_near])
In [1892]:
hc_angles = []
for i in range(len(hc_solns)):
    for j in range(i):
        hc_angles.append(angle(hc_solns[i],hc_solns[j]))
hc_angles = np.array(hc_angles)
In [1893]:
plt.hist(hc_angles,bins=20)
In [1864]:
plt.plot(steps)
Out[1864]:
In [1684]:
sum(abs(gradient(min_line_loss,new_dir)))
Out[1684]:
In [1634]:
plt.plot(sorted(gradient(min_line_loss,end_points[best_ind])))
Out[1634]:
In [1050]:
from scipy.spatial import distance
pdist = distance.pdist(run_record)
In [1051]:
dist_mat = distance.squareform(pdist)
In [1052]:
from matplotlib import colors
ln = colors.LogNorm()
In [1055]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-3h-3-class-normed.jpg',dpi=600)
In [959]:
def pairwise_distances(X, metric=l2):
    ''' naive O(N^2) pairwise distance matrix under a given metric (l2 is defined above) '''
    dists = np.zeros((len(X),len(X)))
    for i in range(len(X)):
        for j in range(len(X)):
            dists[i,j] = metric(X[i],X[j])
    return dists
In [1000]:
W,b,W2,b2 = vec_to_mats(run_record[-1],D,h,K)
# plot the resulting classifier
grid_step = 0.02 # mesh spacing; use a separate name so we don't clobber the hidden-layer size h
x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_step),
                     np.arange(y_min, y_max, grid_step))
Z = np.dot(np.maximum(0, np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b), W2) + b2
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Greys, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y,linewidths=0)#, s=40)#, cmap=plt.cm.Greys)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_net.png')
Out[1000]:
In [359]:
in_dim = 2
hidden_dim = 10
out_dim = 2
W1 = npr.randn(in_dim,hidden_dim)
W2 = npr.randn(hidden_dim,out_dim)
In [362]:
npr.randn(2).dot(W1)
Out[362]:
In [363]:
from scipy.misc import comb
In [369]:
np.prod(range(1,20))
Out[369]:
In [374]:
np.prod(W1.shape) + np.prod(W2.shape)
Out[374]:
In [375]:
parameter_vector = np.hstack((W1.reshape(np.prod(W1.shape)),W2.reshape(np.prod(W2.shape))))
In [398]:
def mats_to_vec(W,b,W2,b2):
    return np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                      np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
In [412]:
def vec_to_mats(param_vector,D,h,K):
    ''' D is input dimensionality, h is number of hidden units, K is number of output units '''
    index = 0
    W = param_vector[index:index + D*h].reshape((D,h))
    index += D*h
    b = param_vector[index:index+h].reshape((1,h))
    index += h
    W2 = param_vector[index:index+h*K].reshape((h,K))
    index += h*K
    b2 = param_vector[index:index+K].reshape((1,K))
    return W,b,W2,b2
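A quick round-trip sanity check (a sketch, not a cell from the original run): mats_to_vec and vec_to_mats should be inverses of each other.
# Toy shapes: D inputs, 3 hidden units, K outputs.
_W, _b, _W2, _b2 = np.ones((D, 3)), np.zeros((1, 3)), np.ones((3, K)), np.zeros((1, K))
_vec = mats_to_vec(_W, _b, _W2, _b2)
assert np.allclose(mats_to_vec(*vec_to_mats(_vec, D, 3, K)), _vec)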
In [770]:
mats = vec_to_mats(run_record[0],D,h,K)
W1,b1,W2,b2 = mats
In [771]:
W1.shape,b1.shape,W2.shape,b2.shape
Out[771]:
In [533]:
def find_permutation(mats):
    ''' simple heuristic: permute according to the column-wise max values of W1 (other heuristics commented out below) '''
    #return range(W1.shape[1])
    W1,b1,W2,b2 = mats
    #return sorted(range(W1.shape[1]),key=lambda i:b1[0,i])
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i])+np.max(W2[i,:]))
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W2[i,:]))
    return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i]))
In [534]:
perm = find_permutation(mats)
In [535]:
def permuted_vector(perm,W1,b1,W2,b2):
    D,h = W1.shape
    K = b2.shape[1]
    #perm = find_permutation(W1)
    return mats_to_vec(W1[:,perm],b1[:,perm],W2[perm,:],b2)
In [536]:
def heuristic_alignment(record,D,h,K):
    perm_record = np.zeros(record.shape)
    for i in range(len(record)):
        mats = vec_to_mats(record[i],D,h,K)
        perm_record[i] = permuted_vector(find_permutation(mats),*mats)
    return perm_record
In [1056]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(run_record)
plt.plot(pca.explained_variance_ratio_[:20])
Out[1056]:
In [1093]:
plt.plot(pca.transform(run_record)[:,0],label='PC1',color='blue',
linewidth=3)
plt.plot(pca.transform(run_record)[:,1],label='PC2',color='blue',
linewidth=2,alpha=0.7)
plt.plot(pca.transform(run_record)[:,2],label='PC3',color='blue',
alpha=0.5)
plt.plot(pca.transform(run_record)[:,3],label='PC4',color='blue',
linestyle='--',alpha=0.5)
plt.plot(pca.transform(run_record)[:,4],label='PC5',color='blue',
linestyle='--',alpha=0.25)
plt.plot(pca.transform(run_record)[:,5],label='PC6+',color='grey',
linestyle='--',alpha=0.1)
for i in range(6,10):
    plt.plot(pca.transform(run_record)[:,i],color='grey',
             linestyle='--',alpha=0.1)
plt.xlabel('Iteration #')
plt.legend(loc='best')
plt.title('Training dynamics projected onto leading principal components')
Out[1093]:
In [1086]:
reduced_dynamics = pca.transform(run_record)[:,:10]
In [973]:
sum(pca.explained_variance_ratio_[:3])
Out[973]:
In [974]:
run_record.shape
Out[974]:
In [975]:
plt.plot(abs(pca.components_[1]))
Out[975]:
In [976]:
plt.plot(sorted(abs(pca.components_[0])))
Out[976]:
In [977]:
sum(abs(pca.components_[0])>0.05)
Out[977]:
In [978]:
plt.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[978]:
In [979]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],pca.transform(run_record)[:,2],
c=range(num_iter),linewidths=0,alpha=0.5)
#plt.colorbar()
Out[979]:
In [989]:
D=2
In [990]:
aligned_record = heuristic_alignment(run_record,D,h,K)
In [1009]:
from scipy.spatial import distance
pdist = distance.pdist(aligned_record)
In [1010]:
dist_mat = distance.squareform(pdist)
In [1011]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-sorted-3h-3-class.jpg',dpi=600)
In [1020]:
h=30
In [1021]:
perms = [find_permutation(vec_to_mats(vec,D,h,K)) for vec in run_record]
In [ ]:
def find_most_common_perm(record,D,h,K):
    ''' return the hidden-unit permutation that occurs most often along a training record '''
    perms = [tuple(find_permutation(vec_to_mats(vec,D,h,K))) for vec in record]
    return max(set(perms), key=perms.count)
In [1022]:
perms = np.array(perms)
In [1023]:
len([p for p in perms])
Out[1023]:
In [1024]:
len(set([tuple(p) for p in perms]))
Out[1024]:
In [1025]:
plt.plot(sorted(np.std(perms,0)))
Out[1025]:
In [1012]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(aligned_record)
plt.plot(pca.explained_variance_ratio_[:20])
Out[1012]:
In [1013]:
sum(pca.explained_variance_ratio_[:2])
Out[1013]:
In [1014]:
plt.scatter(pca.transform(aligned_record)[:,0],pca.transform(aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1014]:
In [1045]:
pca.fit(globally_aligned_record)
Out[1045]:
In [1047]:
sum(pca.explained_variance_ratio_[:3])
Out[1047]:
In [1048]:
plt.scatter(pca.transform(globally_aligned_record)[:,0],pca.transform(globally_aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1048]:
In [1026]:
perm_set_list = list(set([tuple(p) for p in perms]))
In [1027]:
mapping = dict(zip(list(set([tuple(p) for p in perms])),range(len(perm_set_list))))
In [1028]:
mapped = np.array([mapping[tuple(p)] for p in perms])
In [1030]:
res = plt.hist(mapped,bins=83);
In [1035]:
most_freq_perm = perms[mapped==np.argmax(res[0])][0]
In [665]:
most_freq_perm = perms[mapped==66][0]
most_freq_perm
Out[665]:
In [1036]:
globally_aligned_record = np.zeros(run_record.shape)
for i in range(len(run_record)):
    mats = vec_to_mats(run_record[i],D,h,K)
    globally_aligned_record[i] = permuted_vector(most_freq_perm,*mats)
In [1041]:
from scipy.spatial import distance
pdist = distance.pdist(globally_aligned_record)#[::10])
In [1042]:
dist_mat = distance.squareform(pdist)
In [1049]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-global-sorted-30h.jpg',dpi=600)
In [657]:
plt.plot(mapped==66)
Out[657]:
In [641]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(aligned_record[i],aligned_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[641]:
In [642]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Adjusted distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[642]:
In [ ]:
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
In [ ]:
X =
In [ ]:
# can we reduce this to a sorting problem?
In [376]:
sorted(range(len(W2)),key=lambda i:np.mean(W1[:,i]))
Out[376]:
In [396]:
W1.shape
Out[396]:
In [ ]:
# greedy alignment
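# A possible greedy strategy (a sketch, not the author's implementation): align one parameter
# vector to a reference by matching each hidden unit of the reference to its most similar
# unused unit, using the concatenation of incoming weights, bias, and outgoing weights
# as the unit's signature. Reuses vec_to_mats and permuted_vector defined above.
def greedy_alignment(vec, ref_vec, D, h, K):
    W1, b1, W2, b2 = vec_to_mats(vec, D, h, K)
    rW1, rb1, rW2, rb2 = vec_to_mats(ref_vec, D, h, K)
    # per-unit signatures: incoming weights, bias, outgoing weights
    sig = np.vstack([np.hstack((W1[:, i], b1[0, i], W2[i, :])) for i in range(h)])
    ref_sig = np.vstack([np.hstack((rW1[:, i], rb1[0, i], rW2[i, :])) for i in range(h)])
    perm, used = [], set()
    for i in range(h):
        # greedily pick the closest not-yet-used unit for reference unit i
        dists = [np.inf if j in used else np.sum((ref_sig[i] - sig[j])**2) for j in range(h)]
        j = int(np.argmin(dists))
        perm.append(j)
        used.add(j)
    return permuted_vector(perm, W1, b1, W2, b2)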
In [ ]:
In [ ]:
# let's try this for autoencoders!
# initialize parameters randomly
h = 100 # size of hidden layer
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,D)
b2 = np.zeros((1,D))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# keep a record of training progress
params = mats_to_vec(W,b,W2,b2)
num_params = len(params)
num_iter = 10000
run_record = np.zeros((num_iter,num_params))
loss_record = np.zeros(num_iter)
# stochastic gradient descent loop
#mini_batch_size = len(X)
mini_batch_size = 20
for i in xrange(num_iter):
    # sample a minibatch (the reconstruction targets are the inputs themselves)
    if mini_batch_size < len(X):
        mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
        X_ = X[mask]
    else:
        X_ = X[:]
    # compute the full reconstruction loss (for monitoring)
    compute_full_loss = True
    if mini_batch_size == len(X) or compute_full_loss:
        hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
        outputs = np.dot(hidden_layer, W2) + b2
        data_loss = 0.5*np.sum((outputs - X)**2)/len(X)
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        if i % 1000 == 0:
            print "iteration %d: loss %f" % (i, loss)
        loss_record[i] = loss
    # forward pass on the minibatch
    num_examples = X_.shape[0]
    hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
    outputs = np.dot(hidden_layer, W2) + b2
    # gradient of the average squared-error reconstruction loss w.r.t. the outputs
    doutputs = (outputs - X_)/num_examples
    # backpropagate the gradient to the parameters
    # first backprop into parameters W2 and b2
    dW2 = np.dot(hidden_layer.T, doutputs)
    db2 = np.sum(doutputs, axis=0, keepdims=True)
    # next backprop into hidden layer
    dhidden = np.dot(doutputs, W2.T)
    # backprop the ReLU non-linearity
    dhidden[hidden_layer <= 0] = 0
    # finally into W,b
    dW = np.dot(X_.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)
    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W
    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2
    # record the flattened parameter vector after this update
    run_record[i] = mats_to_vec(W,b,W2,b2)