In [1]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'spectral'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [2]:
np.random.seed(0)
N = 500 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in xrange(K):
    ix = range(N*j,N*(j+1))
    r = np.linspace(0.05,1,N) # radius
    t = np.linspace(j*(K+1),(j+1)*(K+1),N) + np.random.randn(N)*0.2 # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40,linewidths=0)#, cmap=plt.cm.Spectral)
plt.xlim([-r[-1]*1.1,r[-1]*1.1])
plt.ylim([-r[-1]*1.1,r[-1]*1.1])
#fig.savefig('spiral_raw.png')
Out[2]:
In [939]:
#Train a Linear Classifier
# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# gradient descent loop
num_examples = X.shape[0]
for i in xrange(200):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(correct_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print "iteration %d: loss %f" % (i, loss)
    # compute the gradient on scores
    dscores = probs
    dscores[range(num_examples),y] -= 1
    dscores /= num_examples
    # backpropagate the gradient to the parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
    dW += reg*W # regularization gradient
    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
In [940]:
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
In [3]:
sigmoid = lambda x : 1.0 / (1.0 + np.exp(-x))
tanh = lambda x: np.tanh(x)
In [4]:
import numpy.random as npr
In [1097]:
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_linear.png')
Out[1097]:
In [1098]:
random_starts = []
for i in range(1000):
    h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    random_starts.append(mats_to_vec(W,b,W2,b2)) # mats_to_vec is defined further down in this notebook
random_starts = np.array(random_starts)
In [1099]:
from scipy.spatial import distance
pdist = distance.pdist(random_starts)
dist_mat = distance.squareform(pdist)
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between random parameter initializations')
plt.savefig('l2-pairwise-param-vec-distances-30h-random.jpg',dpi=600)
In [ ]:
In [5]:
def train_net(h=30,D=D,K=K,num_iter=10000):
    # initialize parameters randomly
    #h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    # some hyperparameters
    step_size = 1e-0
    reg = 1e-3 # regularization strength
    # keep a record of training progress
    params = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                        np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    num_params = len(params)
    run_record = np.zeros((num_iter,num_params))
    loss_record = np.zeros(num_iter)
    # stochastic gradient descent loop
    #mini_batch_size = len(X)
    mini_batch_size = 100
    for i in xrange(num_iter):
        # sample a minibatch
        if mini_batch_size < len(X):
            mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
            X_ = X[mask]
            y_ = y[mask]
        else:
            X_ = X[:]
            y_ = y[:]
        num_examples = len(X)#X_.shape[0]
        # evaluate class scores, [N x K]
        #hidden_layer = tanh(np.dot(X, W) + b)
        #hidden_layer = sigmoid(np.dot(X, W) + b)
        hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
        # optionally compute the full loss: average cross-entropy loss plus regularization
        compute_full_loss = False
        if mini_batch_size == len(X) or compute_full_loss:
            correct_logprobs = -np.log(probs[range(len(X)),y])
            data_loss = np.sum(correct_logprobs)/num_examples
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            loss = data_loss + reg_loss
            if i % 1000 == 0:
                print "iteration %d: loss %f" % (i, loss)
            loss_record[i] = loss
        # compute the gradient on scores (overwritten below when minibatching)
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
        # recompute the forward pass and score gradient on the minibatch only
        if mini_batch_size < len(X):
            num_examples = X_.shape[0]
            hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
            scores = np.dot(hidden_layer, W2) + b2
            # compute the class probabilities
            exp_scores = np.exp(scores)
            probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
            correct_logprobs = -np.log(probs[range(num_examples),y_])
            data_loss = np.sum(correct_logprobs)/num_examples
            reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
            loss = data_loss + reg_loss
            # compute the gradient on scores
            dscores = probs
            dscores[range(num_examples),y_] -= 1
            dscores /= num_examples
        # backpropagate the gradient to the parameters
        # first backprop into parameters W2 and b2
        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)
        # next backprop into hidden layer
        dhidden = np.dot(dscores, W2.T)
        # backprop the ReLU non-linearity
        dhidden[hidden_layer <= 0] = 0
        # finally into W,b
        dW = np.dot(X_.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)
        # add regularization gradient contribution
        dW2 += reg * W2
        dW += reg * W
        # perform a parameter update
        W += -step_size * dW
        b += -step_size * db
        W2 += -step_size * dW2
        b2 += -step_size * db2
        # record the flattened parameter vector after this update
        run_record[i] = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                                   np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    return run_record
In [1105]:
run_record1 = train_net()
In [6]:
run_records = []
In [7]:
# how far away are the endpoints of multiple runs?
from time import time
t = time()
for i in range(10):
    run_records.append(train_net())
    print(i)
print(time() - t)
In [8]:
# now let's do ICA and tICA
from sklearn.decomposition import FastICA
In [96]:
ica = FastICA(2)
In [97]:
all_runs = np.vstack([run for run in run_records])
In [98]:
ica.fit(all_runs[::10])
Out[98]:
In [99]:
c = np.hstack([np.ones(len(run))*i for i,run in enumerate(run_records)])
In [100]:
c.shape
Out[100]:
In [101]:
X_ica = ica.transform(all_runs)[::50]
In [102]:
plt.scatter(X_ica[:,0],X_ica[:,1],c=c[::50],linewidths=0)
Out[102]:
In [103]:
from sklearn.decomposition import PCA
pca = PCA()
X_pca = pca.fit_transform(all_runs)[::50]
In [104]:
plt.scatter(X_pca[:,0],X_pca[:,1],c=c[::50],linewidths=0)
Out[104]:
In [135]:
from msmbuilder.decomposition import tICA
tica = tICA(n_components=2,lag_time=100)
In [136]:
tica.fit([run for run in run_records])
Out[136]:
In [131]:
tica.fit([run_records[0]])
Out[131]:
In [132]:
tica.components_.shape
Out[132]:
In [137]:
X_tica = tica.transform([run_records[0]])[0]
In [140]:
X_tica = tica.transform(run_records)
In [142]:
c.shape
Out[142]:
In [145]:
for i,x in enumerate(X_tica):
    plt.scatter(x[::50,0],x[::50,1],linewidths=0,c=(np.ones(len(x))*i)[::50])
#plt.scatter(X_tica[:,0],X_tica[:,1],linewidths=0)
In [ ]:
In [105]:
start_points = np.array([run_record[0] for run_record in run_records])
pdist = distance.pdist(start_points)
max(pdist),min(pdist)
In [1155]:
def pairwise_distance_over_time(run_records):
    n = len(run_records[0])
    current_points = np.array([run_record[0] for run_record in run_records])
    pdist = distance.pdist(current_points)
    num_dist = len(pdist)
    distances = np.zeros((n,num_dist))
    for i in range(n):
        current_points = np.array([run_record[i] for run_record in run_records])
        pdist = distance.pdist(current_points)
        distances[i] = pdist
    return distances
def plot_pairwise(run_records):
    distances = pairwise_distance_over_time(run_records)
    plt.plot(np.mean(distances,1))
    for i in range(distances.shape[1]):
        plt.scatter(range(distances.shape[0]),distances[:,i],
                    linewidths=0,alpha=0.5)
In [1156]:
distances = pairwise_distance_over_time(run_records)
In [1160]:
plt.plot(np.mean(distances,1))
Out[1160]:
In [1164]:
distances.shape
Out[1164]:
In [1165]:
#lin_dist = distances.reshape(np.prod(distances.shape))
In [1170]:
lin_dist=np.zeros(np.prod(distances.shape))
for i in range(len(distances.T)):
    lin_dist[i*distances.shape[0]:(i+1)*distances.shape[0]] = distances[:,i]
In [1166]:
iter_num = np.hstack([range(len(distances)) for _ in range(distances.shape[1])])
In [1173]:
plt.scatter(iter_num[::100],lin_dist[::100],
linewidth=0)
Out[1173]:
In [1201]:
plt.scatter(iter_num[iter_num<2000][::15],lin_dist[iter_num<2000][::15],
linewidth=0,alpha=0.005);
In [1188]:
dist_vecs = [distance.pdist(run[::10]) for run in run_records]
In [1191]:
np.array(dist_vecs).shape
Out[1191]:
In [1192]:
avg_dist_vec = np.mean(np.array(dist_vecs),0)
In [1193]:
avg_pdist = distance.squareform(avg_dist_vec)
In [1238]:
for i,dist_vec in enumerate(dist_vecs):
    plt.imshow(distance.squareform(dist_vec),
               interpolation='none',
               cmap='Blues')
    plt.savefig('distmat {0}.jpg'.format(i),dpi=600)
Which runs are most typical, and which are least typical?
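One quick way to check (a sketch using the dist_vecs and avg_dist_vec computed above, not a cell from the original run): rank the runs by how far each run's pairwise-distance profile sits from the average profile.
# Hypothetical follow-up: rank runs by how close their distance profile is to the average.
deviations = np.array([np.linalg.norm(dv - avg_dist_vec) for dv in dist_vecs])
print('most typical run: {0}, least typical run: {1}'.format(
    np.argmin(deviations), np.argmax(deviations)))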
In [ ]:
In [1194]:
plt.imshow(avg_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1194]:
In [1195]:
std_dist_vec = np.std(np.array(dist_vecs),0)
In [1196]:
std_pdist = distance.squareform(std_dist_vec)
In [1197]:
plt.imshow(std_pdist,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1197]:
In [1207]:
plt.hist2d(iter_num[iter_num<2000],lin_dist[iter_num<2000],bins=50,
cmap='Blues');
plt.colorbar()
Out[1207]:
In [ ]:
for i in range(distances[:,:100].shape[1]):
    plt.scatter(range(distances.shape[0]),distances[:,i],
                linewidths=0,alpha=0.5)
In [1151]:
end_points = np.array([run_record[-1] for run_record in run_records])
In [1152]:
end_points.shape
Out[1152]:
In [1153]:
pdist = distance.pdist(end_points)
max(pdist),min(pdist)
Out[1153]:
In [1217]:
run_records_ = np.vstack(run_records)
run_records_.shape
Out[1217]:
In [1147]:
run_records_[::50].shape
Out[1147]:
In [1221]:
all_dist = distance.pdist(run_records_[npr.rand(len(run_records_))<0.001])
In [1222]:
plt.hist(all_dist,bins=100);
In [1225]:
pca = PCA()
pca.fit(run_records_[::10])
Out[1225]:
In [1226]:
X_ = pca.transform(run_records_)
In [1227]:
pca.explained_variance_ratio_[:2]
Out[1227]:
In [1228]:
c = np.hstack([i*np.ones(len(run_records[0])) for i in range(len(run_records))])
c.shape
Out[1228]:
In [1229]:
c[::thinning].shape
Out[1229]:
In [1237]:
thinning=50
plt.scatter(X_[::thinning,0],X_[::thinning,1],
c=c[::thinning],
alpha=0.5,
linewidths=0)
plt.xlabel('PC1')
plt.ylabel('PC2')
Out[1237]:
In [1236]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_[::thinning,0],X_[::thinning,1],X_[::thinning,2],
alpha=0.5,c=c[::thinning],linewidths=0)
Out[1236]:
In [962]:
run_record.shape
Out[962]:
In [946]:
D,h,K
Out[946]:
In [947]:
l1 = lambda x,y : sum(abs(x-y))
In [948]:
l2 = lambda x,y : np.sqrt(sum((x-y)**2))
In [949]:
norm = l2
In [963]:
distance_from_start = np.array([norm(run_record[i],run_record[0]) for i in xrange(len(run_record))])
In [964]:
plt.rcParams['font.family'] = 'serif'
In [965]:
plt.plot(distance_from_start)
plt.hlines(max(distance_from_start),0,len(distance_from_start),linestyle='--')
plt.title('Distance from initial parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[965]:
In [999]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(run_record[i],run_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[999]:
In [760]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[760]:
In [761]:
plt.plot(loss_record)
plt.title('Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.ylim(0,max(loss_record))
plt.vlines(best_index,0,max(loss_record),linestyle='--')
Out[761]:
In [762]:
worst_index = np.argmax(loss_record)
distance_from_worst = np.array([norm(run_record[i],run_record[worst_index]) for i in xrange(len(run_record))])
worst_index
Out[762]:
In [763]:
plt.plot(distance_from_worst)
plt.vlines(worst_index,0,max(distance_from_worst),linestyle='--')
plt.title('Distance from worst parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[763]:
In [1001]:
# evaluate training set accuracy
hidden_layer = np.maximum(0, np.dot(X, W) + b)
scores = np.dot(hidden_layer, W2) + b2
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
In [1240]:
def accuracy(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    return np.mean(predicted_class == y)
In [1332]:
def loss(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    num_examples = len(X)
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
    correct_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(correct_logprobs)/num_examples
    reg_loss = 0
    #reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    return data_loss + reg_loss
In [1255]:
D
Out[1255]:
In [1256]:
run_records[0][0].shape
Out[1256]:
In [1258]:
%timeit loss(run_records[0][0])
In [1260]:
interp = lambda initial,final,alpha=0.5: (1-alpha)*initial + alpha*final
In [1372]:
dim = len(end_points[0])
In [1373]:
alpha= np.arange(-0.2,1.5,0.05)
inds = range(len(start_points))
In [1374]:
line_losses = [[loss(interp(start_points[i],end_points[i],a)) for a in alpha] for i in inds]
In [1375]:
line_losses = [[loss(interp(np.zeros(dim),end_points[i],a)) for a in alpha] for i in inds]
In [1604]:
line_losses = [[loss(interp(np.zeros(dim),run_records[i][-1],a)) for a in alpha] for i in inds]
In [1605]:
line_losses = np.array(line_losses)
line_losses.shape
Out[1605]:
In [1606]:
np.min(line_losses)
Out[1606]:
In [1607]:
plt.plot(line_losses[0]);
In [1608]:
best_solution = min([loss(e) for e in end_points])
best_solution
Out[1608]:
In [1615]:
[plt.plot(alpha,line_losses[i],alpha=0.3) for i in range(len(line_losses))];
plt.hlines(best_solution,alpha[0],alpha[-1],linestyle='--',color='grey')
plt.vlines(1,0,np.max(line_losses),linestyle='--',color='grey')
plt.xlabel('alpha')
plt.ylabel('loss')
Out[1615]:
In [1370]:
np.mean(abs(end_points))
Out[1370]:
In [1371]:
np.mean(abs(npr.randn(1000)))
Out[1371]:
So what does this look like if you pick a random direction? Or take one good network and scramble its weight vector? Or interpolate between two good networks?
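The random-direction and scrambling experiments follow below; interpolation between two good networks isn't run here, but a minimal sketch using the loss and interp helpers defined above (alphas_pair is just a name for the grid, assuming end_points holds at least two runs) would be:
# Sketch (not executed in the original notebook): loss along the straight line
# between two independently trained solutions.
alphas_pair = np.arange(-0.2, 1.2, 0.05)
pair_losses = [loss(interp(end_points[0], end_points[1], a)) for a in alphas_pair]
plt.plot(alphas_pair, pair_losses)
plt.xlabel('alpha')
plt.ylabel('loss')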
In [1895]:
rand_end_points = npr.randn(10000,dim)
In [1918]:
line_losses = [[loss(interp(np.zeros(dim),rand_end_points[i],alpha)) for alpha in np.arange(-1,1,0.1)] for i in range(10000)]
In [1904]:
[plt.plot(line) for line in line_losses[:10]];
In [1921]:
np.min(line_losses),np.max(line_losses)
Out[1921]:
In [1902]:
np.array(line_losses).shape
Out[1902]:
In [1922]:
rand_min_losses = np.array([np.min(l) for l in line_losses])
In [1924]:
plt.hist(rand_min_losses,bins=50,normed=True,log=True);
plt.vlines(best_solution,0,130.0,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(np.min(rand_min_losses),0,130.0,linestyle='--',color='blue',label='Best random direction')
plt.legend(loc='best')
plt.xlabel('Loss')
plt.ylabel('Probability density')
plt.title('Line search in random directions doesn\'t work')
Out[1924]:
In [1416]:
scrambled_end_points = [end_points[0][sorted(range(dim),key=lambda i:npr.rand())] for _ in range(100)]
In [1418]:
np.min(line_losses)
Out[1418]:
In [1423]:
end_points.shape
Out[1423]:
In [1428]:
best_ind = np.argmin(np.array([loss(e) for e in end_points]))
best_ind
Out[1428]:
In [1429]:
loss(end_points[best_ind])
Out[1429]:
In [1437]:
line_losses = np.array([[loss(interp(np.zeros(dim),scrambled_end_points[i],a)) for a in alpha] for i in inds])
In [1454]:
plt.bar(range(len(end_points.T)),sorted(np.mean(end_points,0)));
In [1482]:
stats.ttest_1samp(end_points[:,i],0)[0]
Out[1482]:
In [1486]:
from scipy import stats
sig = np.zeros(end_points.shape[1])
for i in range(end_points.shape[1]):
    #mean,(low,up) = stats.bayes_mvs(end_points[:,i],alpha=0.95)[0]
    ##sig[i] = (low*up>0)*abs(mean / (up-low))
    sig[i] = abs(stats.ttest_1samp(end_points[:,i],0)[0])
In [1487]:
plt.plot(sig)
Out[1487]:
In [1498]:
mats=vec_to_mats(sig,D,h,K)
In [1504]:
fig, axes = plt.subplots(nrows=2, ncols=2)
for i,ax in enumerate(axes.flat):
    im = ax.imshow(mats[i],cmap='Blues')
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im,cax=cbar_ax)#, cax=cbar_ax)
Out[1504]:
Hmm, so it looks like the biases learn consistent values across multiple runs.
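A quick way to quantify that (a sketch, assuming end_points, vec_to_mats, and the globals D, h, K as defined elsewhere in this notebook): compare the across-run spread of the bias entries with that of the weight entries.
# Unpack each run's final parameter vector and compare per-parameter std across runs.
end_mats = [vec_to_mats(e, D, h, K) for e in end_points]
b_std = np.std([m[1] for m in end_mats], axis=0).mean()   # hidden-layer biases
b2_std = np.std([m[3] for m in end_mats], axis=0).mean()  # output biases
W_std = np.std([m[0] for m in end_mats], axis=0).mean()   # input-to-hidden weights
W2_std = np.std([m[2] for m in end_mats], axis=0).mean()  # hidden-to-output weights
print('mean across-run std: b={0:.3f}, b2={1:.3f}, W={2:.3f}, W2={3:.3f}'.format(b_std, b2_std, W_std, W2_std))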
In [1458]:
end_points[:,0].shape
Out[1458]:
In [1444]:
plt.hist([l1(np.zeros(dim),e) for e in end_points],bins=50);
In [1506]:
plt.hist([l2(np.zeros(dim),e) for e in end_points],bins=100);
In [1616]:
aligned_end_points = heuristic_alignment(end_points,D,h,K)
In [1618]:
raw_distances = distance.pdist(end_points)
aligned_distances = distance.pdist(aligned_end_points)
In [1619]:
raw_distances.mean(),aligned_distances.mean()
Out[1619]:
In [1620]:
raw_distances.min(),aligned_distances.min()
Out[1620]:
In [1510]:
plt.hist([l2(np.zeros(dim),18*npr.randn(dim)/14) for _ in range(1000)],bins=100);
In [1438]:
plt.plot(line_losses[10])
Out[1438]:
Compared with the direction from start to finish, what is the angle from the start to the current iterate?
In [1511]:
start_to_f = [e - i for (e,i) in zip(end_points,start_points)]
In [1522]:
e = end_points[0]
e.dot(-e)/(np.sqrt(sum(e**2))*np.sqrt(sum(e**2)))
Out[1522]:
In [1576]:
angle = lambda x,y:x.dot(y)/(np.sqrt(sum(x**2))*np.sqrt(sum(y**2)))
In [1534]:
r = run_records[0]
In [1539]:
plt.plot(angle(start_to_f[0],r.T))
Out[1539]:
In [1784]:
[plt.plot(angle(start_to_f[i],run_records[i].T),alpha=0.5) for i in range(len(run_records))];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity w.r.t. final')
Out[1784]:
In [1554]:
angle(end_points[40],end_points[41])
Out[1554]:
In [1563]:
run_records[0].shape
Out[1563]:
In [1572]:
run_records[0][:,:-lag].dot(run_records[0][:,:-lag].T).shape
Out[1572]:
In [1586]:
def lag_angle(record,lag=10):
    n = len(record) - lag
    angles = np.zeros(n)
    for i in range(n):
        angles[i] = angle(record[i],record[i+lag])
    return angles
In [1587]:
run_records[:10][0].shape
Out[1587]:
In [1783]:
lag=200
[plt.plot(lag_angle(r[:1200],lag),alpha=0.5) for r in run_records];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity at lag {0}'.format(lag))
Out[1783]:
In [ ]:
In [1601]:
cosine_distances = distance.pdist(end_points,'cosine')-1
distance.squareform(cosine_distances).shape
Out[1601]:
In [1602]:
np.mean(cosine_distances),np.min(cosine_distances),np.max(cosine_distances)
Out[1602]:
In [ ]:
In [1600]:
distance.cosine(np.ones(200),np.ones(200))
Out[1600]:
In [1559]:
plt.imshow(distance.squareform(distance.pdist(r[::10],'cosine')),cmap='Blues')
Out[1559]:
How sensitive is this result to the direction? i.e. how large is $\left|\frac{\partial \min_\alpha J}{\partial \theta}\right|$?
Now it's convenient to define a loss function in terms of just the direction, since the minimum along that direction can be found easily using a coarse line search.
Line loss
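One way to write this down: the line loss of a direction vector $\theta$ is
$$\hat{J}(\theta) = \min_{\alpha} J(\alpha\,\theta),$$
which min_line_loss below approximates by evaluating $J(\alpha\theta)$ over a coarse grid of $\alpha$ values.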
In [1738]:
min_line_loss = lambda vec,loss=loss,alpha=np.arange(0.0,1.5,0.075):np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1648]:
loss_as_f_alpha = lambda vec,loss=loss:lambda alpha:loss(alpha*vec)
In [ ]:
In [1656]:
from scipy import optimize
direction = end_points[best_ind]
loss_in_best_dir = loss_as_f_alpha(direction)
optimize.fmin(loss_as_f_alpha(direction),1.0)
Out[1656]:
In [1674]:
t = time()
optimize.fmin(loss_in_best_dir,1.0,maxfun=30,disp=0)
print(time() - t)
In [1645]:
%timeit min_line_loss(npr.randn(dim))
In [1646]:
min_line_loss(end_points[best_ind])
Out[1646]:
In [1631]:
def gradient(func,x0,h=0.001):
    ''' one-sided finite-difference approximation to the gradient of func at x0 '''
    x0 = np.array(x0)#,dtype=float)
    y = func(x0)
    deriv = np.zeros(len(x0))
    for i in range(len(x0)):
        x = np.array(x0)
        x[i] += h
        deriv[i] = (func(x) - y)/h
    return deriv
In [1635]:
%timeit gradient(min_line_loss,end_points[best_ind])
In [1638]:
new_dir = end_points[best_ind] - gradient(min_line_loss,end_points[best_ind])
In [1675]:
min_line_loss_scipy = lambda vec,loss=loss:optimize.fmin(loss_as_f_alpha(vec),1.0,maxfun=30,disp=0)
In [1677]:
gradient(min_line_loss_scipy,end_points[best_ind])[:10]
Out[1677]:
In [1678]:
min_line_loss(new_dir)
Out[1678]:
In [1687]:
npr.seed(0)
new_dir = npr.randn(dim)
directions = [new_dir.copy()]
losses = [min_line_loss(new_dir)]
for i in range(10):
    new_dir -= 100*gradient(min_line_loss,new_dir)
    directions.append(new_dir.copy()) # copy, so the list doesn't hold aliases of the same array
    losses.append(min_line_loss(new_dir))
    print(losses[-1])
In [1683]:
sum(abs(new_dir))
Out[1683]:
In [1840]:
def adaptive_metropolis_hastings(func,x0,num_steps=1000,step_size=0.1,verbose=False):
    locs = np.zeros((num_steps+1,len(x0)))
    vals = np.zeros(num_steps+1)
    locs[0] = x0
    vals[0] = func(x0)
    num_rejects = 0
    num_accepts = 0
    message = ''
    adaptation_rate = 1.02
    steps = np.zeros(num_steps+1)
    steps[0] = step_size
    for i in range(num_steps):
        new_loc = locs[i] + npr.randn(len(locs[i]))*step_size
        new_val = func(new_loc)
        #accept = npr.rand() < np.exp(-(new_val - vals[i]))
        accept = new_val <= vals[i]  # greedy acceptance rule (hill-climbing)
        if not accept:
            new_loc = locs[i]
            new_val = vals[i]
            num_rejects += 1
            num_accepts = 0
            if num_rejects > 10:
                if step_size > 1e-3:
                    step_size /= adaptation_rate
                    message = message + '\n reduced step size'
        else:
            num_rejects = 0
            num_accepts += 1
            if num_accepts > 10:
                if step_size < 1.0:
                    step_size *= adaptation_rate
                    message = message + '\n increased step size'
        locs[i+1] = new_loc
        vals[i+1] = new_val
        if i % 100 == 0:
            print("{0}: {1:.3}".format(i,new_val))
            if verbose:
                print(message)
                message = ''
        steps[i+1] = step_size
    return locs,vals,steps
In [1841]:
np.arange(0.0,1.5,0.1).shape
Out[1841]:
In [1842]:
def min_line_loss(vec,loss=loss,alpha=np.arange(0.0,1.5,0.1)):
    return np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1843]:
locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind],num_steps=2000,step_size=0.01)
#+npr.randn(dim)*0.01
In [1865]:
plt.plot(vals)
plt.hlines(vals[0],0,len(vals),linestyle='--',color='grey')
plt.ylabel('loss')
plt.xlabel('Hill-climbing step')
Out[1865]:
In [1845]:
plt.plot(steps)
Out[1845]:
In [1810]:
np.mean(abs(end_points[best_ind]))
Out[1810]:
In [1811]:
np.mean(abs(npr.randn(dim)))
Out[1811]:
In [1849]:
random_directions=npr.randn(100,dim)
In [1850]:
best_rand = np.argmin([min_line_loss(r) for r in random_directions])
In [1886]:
multiple_hc_runs_near = []
for i in range(10):
    locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+npr.randn(dim)*0.5,num_steps=1000,step_size=0.01)
    multiple_hc_runs_near.append((locs,vals,steps))
#locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+0.5*npr.randn(dim),num_steps=5000,step_size=0.01)
In [1879]:
# random initialization
for soln in multiple_hc_runs[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Random initialization')
Out[1879]:
In [1887]:
for soln in multiple_hc_runs_near[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs_near[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Initialized near SGD solution')
Out[1887]:
In [1890]:
hc_solns = np.array([run[0][-1] for run in multiple_hc_runs_near])
In [1892]:
hc_angles = []
for i in range(len(hc_solns)):
    for j in range(i):
        hc_angles.append(angle(hc_solns[i],hc_solns[j]))
hc_angles = np.array(hc_angles)
In [1893]:
plt.hist(hc_angles,bins=20)
In [1864]:
plt.plot(steps)
Out[1864]:
In [1684]:
sum(abs(gradient(min_line_loss,new_dir)))
Out[1684]:
In [1634]:
plt.plot(sorted(gradient(min_line_loss,end_points[best_ind])))
Out[1634]:
In [1050]:
from scipy.spatial import distance
pdist = distance.pdist(run_record)
In [1051]:
dist_mat = distance.squareform(pdist)
In [1052]:
from matplotlib import colors
ln = colors.LogNorm()
In [1055]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-3h-3-class-normed.jpg',dpi=600)
In [959]:
def pairwise_distances(X, metric=l2):
    ''' naive O(N^2) pairwise distance matrix under a given metric (l2 is defined above) '''
    dists = np.zeros((len(X),len(X)))
    for i in range(len(X)):
        for j in range(len(X)):
            dists[i,j] = metric(X[i],X[j])
    return dists
In [1000]:
W,b,W2,b2 = vec_to_mats(run_record[-1],D,h,K)
# plot the resulting classifier
grid_step = 0.02 # mesh spacing; use a separate name so we don't clobber the hidden-layer size h
x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_step),
                     np.arange(y_min, y_max, grid_step))
Z = np.dot(np.maximum(0, np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b), W2) + b2
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Greys, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y,linewidths=0)#, s=40)#, cmap=plt.cm.Greys)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_net.png')
Out[1000]:
In [359]:
in_dim = 2
hidden_dim = 10
out_dim = 2
W1 = npr.randn(in_dim,hidden_dim)
W2 = npr.randn(hidden_dim,out_dim)
In [362]:
npr.randn(2).dot(W1)
Out[362]:
In [363]:
from scipy.misc import comb
In [369]:
np.prod(range(1,20))
Out[369]:
In [374]:
np.prod(W1.shape) + np.prod(W2.shape)
Out[374]:
In [375]:
parameter_vector = np.hstack((W1.reshape(np.prod(W1.shape)),W2.reshape(np.prod(W2.shape))))
In [398]:
def mats_to_vec(W,b,W2,b2):
    return np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                      np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
In [412]:
def vec_to_mats(param_vector,D,h,K):
    ''' D is input dimensionality, h is number of hidden units, K is number of output units '''
    index = 0
    W = param_vector[index:index + D*h].reshape((D,h))
    index += D*h
    b = param_vector[index:index+h].reshape((1,h))
    index += h
    W2 = param_vector[index:index+h*K].reshape((h,K))
    index += h*K
    b2 = param_vector[index:index+K].reshape((1,K))
    return W,b,W2,b2
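A quick round-trip sanity check (a sketch, not a cell from the original run): mats_to_vec and vec_to_mats should be inverses of each other.
# Toy shapes: D inputs, 3 hidden units, K outputs.
_W, _b, _W2, _b2 = np.ones((D, 3)), np.zeros((1, 3)), np.ones((3, K)), np.zeros((1, K))
_vec = mats_to_vec(_W, _b, _W2, _b2)
assert np.allclose(mats_to_vec(*vec_to_mats(_vec, D, 3, K)), _vec)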
In [770]:
mats = vec_to_mats(run_record[0],D,h,K)
W1,b1,W2,b2 = mats
In [771]:
W1.shape,b1.shape,W2.shape,b2.shape
Out[771]:
In [533]:
def find_permutation(mats):
    ''' simple heuristic: permute according to the column-wise max values of W1 (other heuristics commented out below) '''
    #return range(W1.shape[1])
    W1,b1,W2,b2 = mats
    #return sorted(range(W1.shape[1]),key=lambda i:b1[0,i])
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i])+np.max(W2[i,:]))
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W2[i,:]))
    return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i]))
In [534]:
perm = find_permutation(mats)
In [535]:
def permuted_vector(perm,W1,b1,W2,b2):
    D,h = W1.shape
    K = b2.shape[1]
    #perm = find_permutation(W1)
    return mats_to_vec(W1[:,perm],b1[:,perm],W2[perm,:],b2)
In [536]:
def heuristic_alignment(record,D,h,K):
    perm_record = np.zeros(record.shape)
    for i in range(len(record)):
        mats = vec_to_mats(record[i],D,h,K)
        perm_record[i] = permuted_vector(find_permutation(mats),*mats)
    return perm_record
In [1056]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(run_record)
plt.plot(pca.explained_variance_ratio_[:20])
Out[1056]:
In [1093]:
plt.plot(pca.transform(run_record)[:,0],label='PC1',color='blue',
linewidth=3)
plt.plot(pca.transform(run_record)[:,1],label='PC2',color='blue',
linewidth=2,alpha=0.7)
plt.plot(pca.transform(run_record)[:,2],label='PC3',color='blue',
alpha=0.5)
plt.plot(pca.transform(run_record)[:,3],label='PC4',color='blue',
linestyle='--',alpha=0.5)
plt.plot(pca.transform(run_record)[:,4],label='PC5',color='blue',
linestyle='--',alpha=0.25)
plt.plot(pca.transform(run_record)[:,5],label='PC6+',color='grey',
linestyle='--',alpha=0.1)
for i in range(6,10):
    plt.plot(pca.transform(run_record)[:,i],color='grey',
             linestyle='--',alpha=0.1)
plt.xlabel('Iteration #')
plt.legend(loc='best')
plt.title('Training dynamics projected onto leading principal components')
Out[1093]:
In [1086]:
reduced_dynamics = pca.transform(run_record)[:,:10]
In [973]:
sum(pca.explained_variance_ratio_[:3])
Out[973]:
In [974]:
run_record.shape
Out[974]:
In [975]:
plt.plot(abs(pca.components_[1]))
Out[975]:
In [976]:
plt.plot(sorted(abs(pca.components_[0])))
Out[976]:
In [977]:
sum(abs(pca.components_[0])>0.05)
Out[977]:
In [978]:
plt.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[978]:
In [979]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],pca.transform(run_record)[:,2],
c=range(num_iter),linewidths=0,alpha=0.5)
#plt.colorbar()
Out[979]:
In [989]:
D=2
In [990]:
aligned_record = heuristic_alignment(run_record,D,h,K)
In [1009]:
from scipy.spatial import distance
pdist = distance.pdist(aligned_record)
In [1010]:
dist_mat = distance.squareform(pdist)
In [1011]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-sorted-3h-3-class.jpg',dpi=600)
In [1020]:
h=30
In [1021]:
perms = [find_permutation(vec_to_mats(vec,D,h,K)) for vec in run_record]
In [ ]:
def find_most_common_perm(record,D,h,K):
    ''' return the hidden-unit permutation that occurs most often along a training record '''
    perms = [tuple(find_permutation(vec_to_mats(vec,D,h,K))) for vec in record]
    return max(set(perms), key=perms.count)
In [1022]:
perms = np.array(perms)
In [1023]:
len([p for p in perms])
Out[1023]:
In [1024]:
len(set([tuple(p) for p in perms]))
Out[1024]:
In [1025]:
plt.plot(sorted(np.std(perms,0)))
Out[1025]:
In [1012]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(aligned_record)
plt.plot(pca.explained_variance_ratio_[:20])
Out[1012]:
In [1013]:
sum(pca.explained_variance_ratio_[:2])
Out[1013]:
In [1014]:
plt.scatter(pca.transform(aligned_record)[:,0],pca.transform(aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1014]:
In [1045]:
pca.fit(globally_aligned_record)
Out[1045]:
In [1047]:
sum(pca.explained_variance_ratio_[:3])
Out[1047]:
In [1048]:
plt.scatter(pca.transform(globally_aligned_record)[:,0],pca.transform(globally_aligned_record)[:,1],
c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1048]:
In [1026]:
perm_set_list = list(set([tuple(p) for p in perms]))
In [1027]:
mapping = dict(zip(list(set([tuple(p) for p in perms])),range(len(perm_set_list))))
In [1028]:
mapped = np.array([mapping[tuple(p)] for p in perms])
In [1030]:
res = plt.hist(mapped,bins=83);
In [1035]:
most_freq_perm = perms[mapped==np.argmax(res[0])][0]
In [665]:
most_freq_perm = perms[mapped==66][0]
most_freq_perm
Out[665]:
In [1036]:
globally_aligned_record = np.zeros(run_record.shape)
for i in range(len(run_record)):
    mats = vec_to_mats(run_record[i],D,h,K)
    globally_aligned_record[i] = permuted_vector(most_freq_perm,*mats)
In [1041]:
from scipy.spatial import distance
pdist = distance.pdist(globally_aligned_record)#[::10])
In [1042]:
dist_mat = distance.squareform(pdist)
In [1049]:
plt.imshow(dist_mat,interpolation='none',
#norm=ln,
cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-global-sorted-30h.jpg',dpi=600)
In [657]:
plt.plot(mapped==66)
Out[657]:
In [641]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(aligned_record[i],aligned_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[641]:
In [642]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')
plt.title('Adjusted distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[642]:
In [ ]:
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
In [ ]:
X =
In [ ]:
# can we reduce this to a sorting problem?
In [376]:
sorted(range(len(W2)),key=lambda i:np.mean(W1[:,i]))
Out[376]:
In [396]:
W1.shape
Out[396]:
In [ ]:
# greedy alignment
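# A possible greedy strategy (a sketch, not the author's implementation): align one parameter
# vector to a reference by matching each hidden unit of the reference to its most similar
# unused unit, using the concatenation of incoming weights, bias, and outgoing weights
# as the unit's signature. Reuses vec_to_mats and permuted_vector defined above.
def greedy_alignment(vec, ref_vec, D, h, K):
    W1, b1, W2, b2 = vec_to_mats(vec, D, h, K)
    rW1, rb1, rW2, rb2 = vec_to_mats(ref_vec, D, h, K)
    # per-unit signatures: incoming weights, bias, outgoing weights
    sig = np.vstack([np.hstack((W1[:, i], b1[0, i], W2[i, :])) for i in range(h)])
    ref_sig = np.vstack([np.hstack((rW1[:, i], rb1[0, i], rW2[i, :])) for i in range(h)])
    perm, used = [], set()
    for i in range(h):
        # greedily pick the closest not-yet-used unit for reference unit i
        dists = [np.inf if j in used else np.sum((ref_sig[i] - sig[j])**2) for j in range(h)]
        j = int(np.argmin(dists))
        perm.append(j)
        used.add(j)
    return permuted_vector(perm, W1, b1, W2, b2)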
In [ ]:
In [ ]:
# let's try this for autoencoders!
# initialize parameters randomly
h = 100 # size of hidden layer
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,D)
b2 = np.zeros((1,D))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# keep a record of training progress
params = mats_to_vec(W,b,W2,b2)
num_params = len(params)
num_iter = 10000
run_record = np.zeros((num_iter,num_params))
loss_record = np.zeros(num_iter)
# stochastic gradient descent loop
#mini_batch_size = len(X)
mini_batch_size = 20
for i in xrange(num_iter):
    # sample a minibatch (the reconstruction targets are the inputs themselves)
    if mini_batch_size < len(X):
        mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
        X_ = X[mask]
    else:
        X_ = X[:]
    # compute the full reconstruction loss (for monitoring)
    compute_full_loss = True
    if mini_batch_size == len(X) or compute_full_loss:
        hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
        outputs = np.dot(hidden_layer, W2) + b2
        data_loss = 0.5*np.sum((outputs - X)**2)/len(X)
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        if i % 1000 == 0:
            print "iteration %d: loss %f" % (i, loss)
        loss_record[i] = loss
    # forward pass on the minibatch
    num_examples = X_.shape[0]
    hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
    outputs = np.dot(hidden_layer, W2) + b2
    # gradient of the average squared-error reconstruction loss w.r.t. the outputs
    doutputs = (outputs - X_)/num_examples
    # backpropagate the gradient to the parameters
    # first backprop into parameters W2 and b2
    dW2 = np.dot(hidden_layer.T, doutputs)
    db2 = np.sum(doutputs, axis=0, keepdims=True)
    # next backprop into hidden layer
    dhidden = np.dot(doutputs, W2.T)
    # backprop the ReLU non-linearity
    dhidden[hidden_layer <= 0] = 0
    # finally into W,b
    dW = np.dot(X_.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)
    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W
    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2
    # record the flattened parameter vector after this update
    run_record[i] = mats_to_vec(W,b,W2,b2)