In [1]:
print ("Notebook 1: Why Is ML Difficult?")
#This is Python Notebook to walk through polynomial regression examples
#We will use this to think about regression
import numpy as np
import sklearn as sk
from sklearn import datasets, linear_model
from sklearn.preprocessing import PolynomialFeatures
import matplotlib as mpl
from matplotlib import pyplot as plt
#import ml_style as sty # optional style file, can be commented
#mpl.rcParams.update(sty.style) # optional style file, can be commented
%matplotlib notebook
# The Training Data
N_train=100
sigma_train=1
# Training inputs on a uniform grid in [0.05, 0.95]
x=np.linspace(0.05,0.95,N_train)
# Draw random noise
s = sigma_train*np.random.randn(N_train)
#linear
y=2*x+s
#Tenth Order
#y=2*x-10*x**5+15*x**10+s
p1=plt.plot(x,y, "o",ms=15, label='Training')
#Linear Regression
# Create linear regression object
clf = linear_model.LinearRegression()
# Train the model using the training sets
clf.fit(x[:, np.newaxis], y)
# Plot the linear fit on a fine grid
xplot=np.linspace(0.02,0.98,200)
linear_plot=plt.plot(xplot, clf.predict(xplot[:, np.newaxis]),label='Linear')
#Polynomial Regression
poly3 = PolynomialFeatures(degree=3)
X = poly3.fit_transform(x[:,np.newaxis])
clf3 = linear_model.LinearRegression()
clf3.fit(X,y)
Xplot=poly3.fit_transform(xplot[:,np.newaxis])
poly3_plot=plt.plot(xplot, clf3.predict(Xplot), label='Poly 3')
#poly5 = PolynomialFeatures(degree=5)
#X = poly5.fit_transform(x[:,np.newaxis])
#clf5 = linear_model.LinearRegression()
#clf5.fit(X,y)
#Xplot=poly5.fit_transform(xplot[:,np.newaxis])
#plt.plot(xplot, clf5.predict(Xplot), 'r--',linewidth=1)
poly10 = PolynomialFeatures(degree=10)
X = poly10.fit_transform(x[:,np.newaxis])
clf10 = linear_model.LinearRegression()
clf10.fit(X,y)
Xplot=poly10.fit_transform(xplot[:,np.newaxis])
poly10_plot=plt.plot(xplot, clf10.predict(Xplot), label='Poly 10')
axes = plt.gca()
axes.set_ylim([-7,7])
handles, labels=axes.get_legend_handles_labels()
plt.legend(handles,labels, loc='lower center')
plt.xlabel("$x$")
plt.ylabel("$y$")
Title="$N=$"+str(N_train)+", $\sigma=$"+str(sigma_train)
plt.title(Title+" (train)")
plt.tight_layout()
plt.show()
#Linear Filename
filename_train=Title+"train-linear.pdf"
#Tenth Order Filename
#filename_train=Title+"train-o10.pdf"
plt.savefig(filename_train)
# Generate Test Data
#Number of test data
N_test=20
sigma_test=sigma_train
max_x=1.2
x_test=max_x*np.random.random(N_test)
# Draw random noise
s_test = sigma_test*np.random.randn(N_test)
#Linear
y_test=2*x_test+s_test
#Tenth order
#y_test=2*x_test-10*x_test**5+15*x_test**10+s_test
#Make design matrices for prediction
x_plot=np.linspace(0,max_x, 200)
X3 = poly3.fit_transform(x_plot[:,np.newaxis])
X10 = poly10.fit_transform(x_plot[:,np.newaxis])
%matplotlib notebook
fig = plt.figure()
p1=plt.plot(x_test,y_test.transpose(), 'o', ms=12, label='data')
p2=plt.plot(x_plot,clf.predict(x_plot[:,np.newaxis]), label='linear')
p3=plt.plot(x_plot,clf3.predict(X3), label='3rd order')
p10=plt.plot(x_plot,clf10.predict(X10), label='10th order')
plt.legend(loc=2)
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.legend(loc='best')
plt.title(Title+" (pred.)")
plt.tight_layout()
plt.show()
#Linear Filename
filename_test=Title+"pred-linear.pdf"
#Tenth Order Filename
#filename_test=Title+"pred-o10.pdf"
plt.savefig(filename_test)
plt.ylim((-6,12))
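# To quantify what the two plots show, a minimal sketch comparing in-sample and
# out-of-sample mean-squared errors for the three fits above (mean_squared_error
# is scikit-learn's standard regression metric; variable names follow this cell):
from sklearn.metrics import mean_squared_error
for name, model, Xtr, Xte in [('Linear', clf, x[:, np.newaxis], x_test[:, np.newaxis]),
                              ('Poly 3', clf3, poly3.fit_transform(x[:, np.newaxis]), poly3.fit_transform(x_test[:, np.newaxis])),
                              ('Poly 10', clf10, poly10.fit_transform(x[:, np.newaxis]), poly10.fit_transform(x_test[:, np.newaxis]))]:
    print(name, 'train MSE:', mean_squared_error(y, model.predict(Xtr)),
          'test MSE:', mean_squared_error(y_test, model.predict(Xte)))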
Out[1]:
In [5]:
print ("Notebook 2: Gradient Descent")
#This cell sets up the basic plotting functions
#we will use to visualize the gradient descent routines.
#Make plots inline
%matplotlib inline
#Make 3D plots
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
#from matplotlib import animation
#from IPython.display import HTML
from matplotlib.colors import LogNorm
#from itertools import zip_longest
#Import Numpy
import numpy as np
#Define function for plotting
def plot_surface(x,y,z,azim=-60,elev=40, dist=10, cmap="RdYlBu_r"):
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
plot_args = {'rstride': 1, 'cstride': 1, 'cmap':cmap,
'linewidth': 20, 'antialiased': True,
'vmin': -2, 'vmax': 2}
ax.plot_surface(x, y, z, **plot_args)
ax.view_init(azim=azim, elev=elev)
ax.dist=dist
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.set_zlim(-2, 2)
plt.xticks([-1, -0.5, 0, 0.5, 1],
[r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"])
plt.yticks([-1, -0.5, 0, 0.5, 1],
[r"$-1$", r"$-1/2$", r"$0$", r"$1/2$", r"$1$"])
ax.set_zticks([-2, -1, 0, 1, 2])
ax.set_zticklabels([r"$-2$", r"$-1$", r"$0$", r"$1$", r"$2$"])
ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.set_xlabel(r"$x$", fontsize=18)
ax.set_ylabel(r"$y$", fontsize=18)
ax.set_zlabel(r"z", fontsize=18)
return fig,ax;
def overlay_trajectory_quiver(ax,obj_func,trajectory, color='k'):
xs=trajectory[:,0]
ys=trajectory[:,1]
zs=obj_func(xs,ys)
ax.quiver(xs[:-1], ys[:-1], zs[:-1], xs[1:]-xs[:-1], ys[1:]-ys[:-1],zs[1:]-zs[:-1],color=color,arrow_length_ratio=0.3)
return ax;
def overlay_trajectory(ax,obj_func,trajectory,label,color='k'):
xs=trajectory[:,0]
ys=trajectory[:,1]
zs=obj_func(xs,ys)
ax.plot(xs,ys,zs, color, label=label)
return ax;
def overlay_trajectory_contour_M(ax,trajectory, label,color='k',lw=2):
xs=trajectory[:,0]
ys=trajectory[:,1]
ax.plot(xs,ys, color, label=label,lw=lw)
ax.plot(xs[-1],ys[-1],color+'>', markersize=14)
return ax;
def overlay_trajectory_contour(ax,trajectory, label,color='k',lw=2):
xs=trajectory[:,0]
ys=trajectory[:,1]
ax.plot(xs,ys, color, label=label,lw=lw)
return ax;
#DEFINE SURFACES WE WILL WORK WITH
#Define monkey saddle and gradient
def monkey_saddle(x,y):
return x**3 - 3*x*y**2
def grad_monkey_saddle(params):
x=params[0]
y=params[1]
grad_x= 3*x**2-3*y**2
grad_y= -6*x*y
return [grad_x,grad_y]
#Define saddle surface
def saddle_surface(x,y,a=1,b=1):
return a*x**2-b*y**2
def grad_saddle_surface(params,a=1,b=1):
    x=params[0]
    y=params[1]
    grad_x= 2*a*x
    grad_y= -2*b*y
    return [grad_x,grad_y]
# Define minima_surface
def minima_surface(x,y,a=1,b=1):
return a*x**2+b*y**2-1
def grad_minima_surface(params,a=1,b=1):
x=params[0]
y=params[1]
grad_x= 2*a*x
grad_y= 2*b*y
return [grad_x,grad_y]
def beales_function(x,y):
f=np.square(1.5-x+x*y)+np.square(2.25-x+x*y*y)+np.square(2.625-x+x*y**3)
return f
def grad_beales_function(params):
x=params[0]
y=params[1]
grad_x=2*(1.5-x+x*y)*(-1+y)+2*(2.25-x+x*y**2)*(-1+y**2)+2*(2.625-x+x*y**3)*(-1+y**3)
grad_y=2*(1.5-x+x*y)*x+4*(2.25-x+x*y**2)*x*y+6*(2.625-x+x*y**3)*x*y**2
return [grad_x,grad_y]
def contour_beales_function():
#plot beales function
x, y = np.meshgrid(np.arange(-4.5, 4.5, 0.2), np.arange(-4.5, 4.5, 0.2))
fig, ax = plt.subplots(figsize=(10, 6))
z=beales_function(x,y)
ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap="RdYlBu_r")
ax.plot(3,0.5, 'r*', markersize=18)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_xlim((-4.5, 4.5))
ax.set_ylim((-4.5, 4.5))
return fig,ax
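# A quick sanity check of the analytic gradient above against a centered
# finite-difference estimate (a sketch; the test point and step size h are
# arbitrary choices):
h = 1e-6
x0, y0 = 1.0, 1.5
fd_x = (beales_function(x0+h, y0) - beales_function(x0-h, y0))/(2*h)
fd_y = (beales_function(x0, y0+h) - beales_function(x0, y0-h))/(2*h)
print(grad_beales_function([x0, y0]), [fd_x, fd_y])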
#Make plots of surfaces
x, y = np.mgrid[-1:1:31j, -1:1:31j]
fig1,ax1=plot_surface(x,y,monkey_saddle(x,y))
fig2,ax2=plot_surface(x,y,saddle_surface(x,y))
fig3,ax3=plot_surface(x,y,minima_surface(x,y,5),0)
#Contour plot of Beale's Function
fig4,ax4 =contour_beales_function()
plt.show()
#This cell implements plain gradient descent, gradient descent with momentum,
#and Nesterov accelerated gradient (NAG).
#Mean-gradient based methods
def gd(grad, init, n_epochs=1000, eta=10**-4, noise_strength=0):
#This is a simple optimizer
params=np.array(init)
param_traj=np.zeros([n_epochs+1,2])
param_traj[0,]=init
v=0;
for j in range(n_epochs):
noise=noise_strength*np.random.randn(params.size)
v=eta*(np.array(grad(params))+noise)
params=params-v
param_traj[j+1,]=params
return param_traj
def gd_with_mom(grad, init, n_epochs=5000, eta=10**-4, gamma=0.9,noise_strength=0):
params=np.array(init)
param_traj=np.zeros([n_epochs+1,2])
param_traj[0,]=init
v=0
for j in range(n_epochs):
noise=noise_strength*np.random.randn(params.size)
v=gamma*v+eta*(np.array(grad(params))+noise)
params=params-v
param_traj[j+1,]=params
return param_traj
def NAG(grad, init, n_epochs=5000, eta=10**-4, gamma=0.9,noise_strength=0):
params=np.array(init)
param_traj=np.zeros([n_epochs+1,2])
param_traj[0,]=init
v=0
for j in range(n_epochs):
noise=noise_strength*np.random.randn(params.size)
params_nesterov=params-gamma*v
v=gamma*v+eta*(np.array(grad(params_nesterov))+noise)
params=params-v
param_traj[j+1,]=params
return param_traj
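# A minimal usage sketch of the three optimizers defined above on the convex
# bowl minima_surface (minimum at the origin); the initial point, step size,
# and epoch count are arbitrary choices, and all three trajectories should end
# near (0, 0):
for optimizer in [gd, gd_with_mom, NAG]:
    traj = optimizer(grad_minima_surface, [2.0, 3.0], n_epochs=2000, eta=0.01)
    print(optimizer.__name__, traj[-1])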
# Investigate effect of learning rate in GD
x, y = np.meshgrid(np.arange(-4.5, 4.5, 0.2), np.arange(-4.5, 4.5, 0.2))
fig, ax = plt.subplots(figsize=(10, 6))
z=minima_surface(x,y,1,10)
ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap="RdYlBu_r")
ax.plot(0,0, 'r*', markersize=18)
#initial point
init1=[-2,4]
init2=[-1.7,4]
init3=[-1.5,4]
init4=[-3,4.5]
eta1=0.1
eta2=0.5
eta3=1
eta4=1.01
gd_1=gd(grad_minima_surface,init1, n_epochs=100, eta=eta1)
gd_2=gd(grad_minima_surface,init2, n_epochs=100, eta=eta2)
gd_3=gd(grad_minima_surface,init3, n_epochs=100, eta=eta3)
gd_4=gd(grad_minima_surface,init4, n_epochs=10, eta=eta4)
#print(gd_1)
overlay_trajectory_contour(ax,gd_1,r'$\eta=$%s'% eta1,'g--*', lw=0.5)
overlay_trajectory_contour(ax,gd_2,r'$\eta=$%s'% eta2,'b-<', lw=0.5)
overlay_trajectory_contour(ax,gd_3,r'$\eta=$%s'% eta3,'->', lw=0.5)
overlay_trajectory_contour(ax,gd_4,r'$\eta=$%s'% eta4,'c-o', lw=0.5)
plt.legend(loc=2)
plt.show()
fig.savefig("GD3regimes.pdf", bbox_inches='tight')
#Methods that exploit the first and second moments of the gradient: RMS-prop and ADAM
def rms_prop(grad, init, n_epochs=5000, eta=10**-3, beta=0.9,epsilon=10**-8,noise_strength=0):
    params=np.array(init)
    param_traj=np.zeros([n_epochs+1,2])
    param_traj[0,]=init
    grad_sq=0
    for j in range(n_epochs):
        noise=noise_strength*np.random.randn(params.size)
        g=np.array(grad(params))+noise
        grad_sq=beta*grad_sq+(1-beta)*g*g
        v=eta*np.divide(g,np.sqrt(grad_sq+epsilon))
        params=params-v
        param_traj[j+1,]=params
    return param_traj
def adams(grad, init, n_epochs=5000, eta=10**-4, gamma=0.9, beta=0.99,epsilon=10**-8,noise_strength=0):
params=np.array(init)
param_traj=np.zeros([n_epochs+1,2])
param_traj[0,]=init
v=0;
grad_sq=0;
for j in range(n_epochs):
noise=noise_strength*np.random.randn(params.size)
g=np.array(grad(params))+noise
v=gamma*v+(1-gamma)*g
grad_sq=beta*grad_sq+(1-beta)*g*g
v_hat=v/(1-gamma)
grad_sq_hat=grad_sq/(1-beta)
params=params-eta*np.divide(v_hat,np.sqrt(grad_sq_hat+epsilon))
param_traj[j+1,]=params
return param_traj
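# For reference, the textbook ADAM bias correction divides the running averages
# by 1-gamma**t and 1-beta**t at step t, rather than by the constant factors
# used in adams() above; a minimal sketch of that variant (same call signature,
# only the two *_hat lines differ):
def adam_bias_corrected(grad, init, n_epochs=5000, eta=10**-4, gamma=0.9, beta=0.99, epsilon=10**-8, noise_strength=0):
    params=np.array(init)
    param_traj=np.zeros([n_epochs+1,2])
    param_traj[0,]=init
    v=0
    grad_sq=0
    for j in range(n_epochs):
        noise=noise_strength*np.random.randn(params.size)
        g=np.array(grad(params))+noise
        v=gamma*v+(1-gamma)*g
        grad_sq=beta*grad_sq+(1-beta)*g*g
        v_hat=v/(1-gamma**(j+1))
        grad_sq_hat=grad_sq/(1-beta**(j+1))
        params=params-eta*np.divide(v_hat,np.sqrt(grad_sq_hat+epsilon))
        param_traj[j+1,]=params
    return param_traj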
#Make static plot of the results
Nsteps=10**4
lr_l=10**-3
lr_s=10**-6
init1=np.array([4,3])
fig1, ax1=contour_beales_function()
gd_trajectory1=gd(grad_beales_function,init1,Nsteps, eta=lr_s, noise_strength=0)
gdm_trajectory1=gd_with_mom(grad_beales_function,init1,Nsteps,eta=lr_s, gamma=0.9,noise_strength=0)
NAG_trajectory1=NAG(grad_beales_function,init1,Nsteps,eta=lr_s, gamma=0.9,noise_strength=0)
rms_prop_trajectory1=rms_prop(grad_beales_function,init1,Nsteps,eta=lr_l, beta=0.9,epsilon=10**-8,noise_strength=0)
adam_trajectory1=adams(grad_beales_function,init1,Nsteps,eta=lr_l, gamma=0.9, beta=0.99,epsilon=10**-8,noise_strength=0)
overlay_trajectory_contour_M(ax1,gd_trajectory1, 'GD','k')
overlay_trajectory_contour_M(ax1,gdm_trajectory1, 'GDM','m')
overlay_trajectory_contour_M(ax1,NAG_trajectory1, 'NAG','c--')
overlay_trajectory_contour_M(ax1,rms_prop_trajectory1,'RMS', 'b-.')
overlay_trajectory_contour_M(ax1,adam_trajectory1,'ADAM', 'r')
plt.legend(loc=2)
#init2=np.array([1.5,1.5])
#gd_trajectory2=gd(grad_beales_function,init2,Nsteps, eta=10**-6, noise_strength=0)
#gdm_trajectory2=gd_with_mom(grad_beales_function,init2,Nsteps,eta=10**-6, gamma=0.9,noise_strength=0)
#NAG_trajectory2=NAG(grad_beales_function,init2,Nsteps,eta=10**-6, gamma=0.9,noise_strength=0)
#rms_prop_trajectory2=rms_prop(grad_beales_function,init2,Nsteps,eta=10**-3, beta=0.9,epsilon=10**-8,noise_strength=0)
#adam_trajectory2=adams(grad_beales_function,init2,Nsteps,eta=10**-3, gamma=0.9, beta=0.99,epsilon=10**-8,noise_strength=0)
#overlay_trajectory_contour_M(ax1,gdm_trajectory2, 'GDM','m')
#overlay_trajectory_contour_M(ax1,NAG_trajectory2, 'NAG','c--')
#overlay_trajectory_contour_M(ax1,rms_prop_trajectory2,'RMS', 'b-.')
#overlay_trajectory_contour_M(ax1,adam_trajectory2,'ADAMS', 'r')
init3=np.array([-1,4])
gd_trajectory3=gd(grad_beales_function,init3,10**5, eta=lr_s, noise_strength=0)
gdm_trajectory3=gd_with_mom(grad_beales_function,init3,10**5,eta=lr_s, gamma=0.9,noise_strength=0)
NAG_trajectory3=NAG(grad_beales_function,init3,Nsteps,eta=lr_s, gamma=0.9,noise_strength=0)
rms_prop_trajectory3=rms_prop(grad_beales_function,init3,Nsteps,eta=lr_l, beta=0.9,epsilon=10**-8,noise_strength=0)
adam_trajectory3=adams(grad_beales_function,init3,Nsteps,eta=lr_l, gamma=0.9, beta=0.99,epsilon=10**-8,noise_strength=0)
overlay_trajectory_contour_M(ax1,gd_trajectory3, 'GD','k')
overlay_trajectory_contour_M(ax1,gdm_trajectory3, 'GDM','m')
overlay_trajectory_contour_M(ax1,NAG_trajectory3, 'NAG','c--')
overlay_trajectory_contour_M(ax1,rms_prop_trajectory3,'RMS', 'b-.')
overlay_trajectory_contour_M(ax1,adam_trajectory3,'ADAM', 'r')
init4=np.array([-2,-4])
gd_trajectory4=gd(grad_beales_function,init4,Nsteps, eta=lr_s, noise_strength=0)
gdm_trajectory4=gd_with_mom(grad_beales_function,init4,Nsteps,eta=lr_s, gamma=0.9,noise_strength=0)
NAG_trajectory4=NAG(grad_beales_function,init4,Nsteps,eta=lr_s, gamma=0.9,noise_strength=0)
rms_prop_trajectory4=rms_prop(grad_beales_function,init4,Nsteps,eta=lr_l, beta=0.9,epsilon=10**-8,noise_strength=0)
adam_trajectory4=adams(grad_beales_function,init4,Nsteps,eta=lr_l, gamma=0.9, beta=0.99,epsilon=10**-8,noise_strength=0)
overlay_trajectory_contour_M(ax1,gd_trajectory4, 'GD','k')
overlay_trajectory_contour_M(ax1,gdm_trajectory4, 'GDM','m')
overlay_trajectory_contour_M(ax1,NAG_trajectory4, 'NAG','c--')
overlay_trajectory_contour_M(ax1,rms_prop_trajectory4,'RMS', 'b-.')
overlay_trajectory_contour_M(ax1,adam_trajectory4,'ADAM', 'r')
plt.show()
In [1]:
print ("Section 3: Linear Regression")
%matplotlib inline
#This code is modified from plot_cv_diabetes.py in the skit-learn documentation
#and plot_ridge_path.py
from __future__ import print_function
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn import datasets, linear_model
#Load Training Data set with 200 examples
number_examples=200
diabetes = datasets.load_diabetes()
X = diabetes.data[:number_examples]
y = diabetes.target[:number_examples]
#Set up Lasso and Ridge Regression models
ridge=linear_model.Ridge()
lasso = linear_model.Lasso()
#Choose the grid of regularisation strengths for the paths
alphas = np.logspace(-2, 2, 10)
# To see how well we learn, we partition the dataset into a training set with 100
# examples and a test set with 100 examples, and record the errors on each.
n_samples_train = 100
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]
train_errors_ridge = list()
test_errors_ridge = list()
train_errors_lasso = list()
test_errors_lasso = list()
#Initialize coefficients for ridge regression and LASSO
coefs_ridge = []
coefs_lasso=[]
for a in alphas:
ridge.set_params(alpha=a)
ridge.fit(X_train, y_train)
coefs_ridge.append(ridge.coef_)
# Use the coefficient of determination R^2 as the performance of prediction.
train_errors_ridge.append(ridge.score(X_train, y_train))
test_errors_ridge.append(ridge.score(X_test, y_test))
lasso.set_params(alpha=a)
lasso.fit(X_train, y_train)
coefs_lasso.append(lasso.coef_)
train_errors_lasso.append(lasso.score(X_train, y_train))
test_errors_lasso.append(lasso.score(X_test, y_test))
###############################################################################
# Display results
# First, see how the 10 learned feature coefficients scale as we change the regularization parameter
plt.subplot(1,2,1)
plt.semilogx(alphas, np.abs(coefs_ridge))
axes = plt.gca()
#ax.set_xscale('log')
#ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel(r'$\lambda$',fontsize=18)
plt.ylabel('$|w_i|$',fontsize=18)
plt.title('Ridge')
#plt.savefig("Ridge_sparsity_scale.pdf.pdf")
plt.subplot(1,2,2)
plt.semilogx(alphas, np.abs(coefs_lasso))
axes = plt.gca()
#ax.set_xscale('log')
#ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel(r'$\lambda$',fontsize=18)
#plt.ylabel('$|\mathbf{w}|$',fontsize=18)
plt.title('LASSO')
#plt.savefig("LASSO_sparsity_scale.pdf")
plt.show()
# Plot our performance on both the training and test data
plt.semilogx(alphas, train_errors_ridge, 'b',label='Train (Ridge)')
plt.semilogx(alphas, test_errors_ridge, '--b',label='Test (Ridge)')
plt.semilogx(alphas, train_errors_lasso, 'g',label='Train (LASSO)')
plt.semilogx(alphas, test_errors_lasso, '--g',label='Test (LASSO)')
#plt.vlines(alpha_optim, plt.ylim()[0], np.max(test_errors), color='k',
# linewidth=3, label='Optimum on test')
plt.legend(loc='upper right')
plt.ylim([0, 1.0])
plt.xlabel(r'$\lambda$',fontsize=18)
plt.ylabel('Performance')
#plt.savefig("Ridge_LASSO_sparsity_performance.pdf")
plt.show()
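# The regularisation strength is usually chosen by cross-validation rather than
# read off a plot; a minimal sketch using scikit-learn's built-in
# cross-validated estimators on the same training split (reusing the alphas
# grid defined above):
from sklearn.linear_model import RidgeCV, LassoCV
ridge_cv = RidgeCV(alphas=alphas).fit(X_train, y_train)
lasso_cv = LassoCV(alphas=alphas).fit(X_train, y_train)
print('RidgeCV: best alpha =', ridge_cv.alpha_, ', test R^2 =', ridge_cv.score(X_test, y_test))
print('LassoCV: best alpha =', lasso_cv.alpha_, ', test R^2 =', lasso_cv.score(X_test, y_test))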
In [1]:
print ("Section 4: Hamiltonian")
import numpy as np
import scipy.sparse as sp
np.random.seed(12)
import warnings
#Comment this to turn on warnings
warnings.filterwarnings('ignore')
### define Ising model params
# system size
L=40
# create 10000 random Ising states
states=np.random.choice([-1, 1], size=(10000,L))
def ising_energies(states,L):
"""
This function calculates the energies of the states in the nn Ising Hamiltonian
"""
J=np.zeros((L,L),)
for i in range(L):
J[i,(i+1)%L]-=1.0
# compute energies
E = np.einsum('...i,ij,...j->...',states,J,states)
return E
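# Quick sanity check (a sketch): the einsum-based energy of the first state
# should equal an explicit nearest-neighbour sum with periodic boundaries,
# E = -sum_i s_i s_{i+1}.
s0 = states[0]
print(ising_energies(states[:1], L)[0], -np.sum(s0*np.roll(s0, -1)))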
# calculate Ising energies
energies=ising_energies(states,L)
# reshape Ising states into RL samples: S_iS_j --> X_p
states=np.einsum('...i,...j->...ij', states, states)
shape=states.shape
states=states.reshape((shape[0],shape[1]*shape[2]))
# build final data set
Data=[states,energies]
# define number of samples
n_samples=400
# define train and test data sets
X_train=Data[0][:n_samples]
Y_train=Data[1][:n_samples] #+ np.random.normal(0,4.0,size=X_train.shape[0])
X_test=Data[0][n_samples:3*n_samples//2]
Y_test=Data[1][n_samples:3*n_samples//2] #+ np.random.normal(0,4.0,size=X_test.shape[0])
from sklearn import linear_model
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn
%matplotlib inline
# set up Lasso and Ridge Regression models
leastsq=linear_model.LinearRegression()
ridge=linear_model.Ridge()
lasso = linear_model.Lasso()
# define error lists
train_errors_leastsq = []
test_errors_leastsq = []
train_errors_ridge = []
test_errors_ridge = []
train_errors_lasso = []
test_errors_lasso = []
# set regularisation strength values
lmbdas = np.logspace(-4, 5, 10)
#Initialize coefficients for ridge regression and LASSO
coefs_leastsq = []
coefs_ridge = []
coefs_lasso=[]
for lmbda in lmbdas:
### ordinary least squares
leastsq.fit(X_train, Y_train) # fit model
coefs_leastsq.append(leastsq.coef_) # store weights
# use the coefficient of determination R^2 as the performance of prediction.
train_errors_leastsq.append(leastsq.score(X_train, Y_train))
test_errors_leastsq.append(leastsq.score(X_test,Y_test))
### apply Ridge regression
ridge.set_params(alpha=lmbda) # set regularisation parameter
ridge.fit(X_train, Y_train) # fit model
coefs_ridge.append(ridge.coef_) # store weights
# use the coefficient of determination R^2 as the performance of prediction.
train_errors_ridge.append(ridge.score(X_train, Y_train))
test_errors_ridge.append(ridge.score(X_test,Y_test))
    ### apply LASSO regression
lasso.set_params(alpha=lmbda) # set regularisation parameter
lasso.fit(X_train, Y_train) # fit model
coefs_lasso.append(lasso.coef_) # store weights
# use the coefficient of determination R^2 as the performance of prediction.
train_errors_lasso.append(lasso.score(X_train, Y_train))
test_errors_lasso.append(lasso.score(X_test,Y_test))
### plot Ising interaction J
J_leastsq=np.array(leastsq.coef_).reshape((L,L))
J_ridge=np.array(ridge.coef_).reshape((L,L))
J_lasso=np.array(lasso.coef_).reshape((L,L))
cmap_args=dict(vmin=-1., vmax=1., cmap='seismic')
fig, axarr = plt.subplots(nrows=1, ncols=3)
axarr[0].imshow(J_leastsq,**cmap_args)
axarr[0].set_title('$\\mathrm{OLS}$',fontsize=16)
axarr[0].tick_params(labelsize=16)
axarr[1].imshow(J_ridge,**cmap_args)
axarr[1].set_title('$\\mathrm{Ridge},\ \\lambda=%.4f$' %(lmbda),fontsize=16)
axarr[1].tick_params(labelsize=16)
im=axarr[2].imshow(J_lasso,**cmap_args)
axarr[2].set_title('$\\mathrm{LASSO},\ \\lambda=%.4f$' %(lmbda),fontsize=16)
axarr[2].tick_params(labelsize=16)
divider = make_axes_locatable(axarr[2])
cax = divider.append_axes("right", size="5%", pad=0.05)
cbar=fig.colorbar(im, cax=cax)
cbar.ax.set_yticklabels(np.arange(-1.0, 1.0+0.25, 0.25),fontsize=14)
cbar.set_label('$J_{i,j}$',labelpad=-40, y=1.12,fontsize=16,rotation=0)
fig.subplots_adjust(right=2.0)
plt.show()
# Plot our performance on both the training and test data
plt.semilogx(lmbdas, train_errors_leastsq, 'b',label='Train (OLS)')
plt.semilogx(lmbdas, test_errors_leastsq,'--b',label='Test (OLS)')
plt.semilogx(lmbdas, train_errors_ridge,'r',label='Train (Ridge)',linewidth=1)
plt.semilogx(lmbdas, test_errors_ridge,'--r',label='Test (Ridge)',linewidth=1)
plt.semilogx(lmbdas, train_errors_lasso, 'g',label='Train (LASSO)')
plt.semilogx(lmbdas, test_errors_lasso, '--g',label='Test (LASSO)')
fig = plt.gcf()
fig.set_size_inches(10.0, 6.0)
#plt.vlines(alpha_optim, plt.ylim()[0], np.max(test_errors), color='k',
# linewidth=3, label='Optimum on test')
plt.legend(loc='lower left',fontsize=16)
plt.ylim([-0.01, 1.01])
plt.xlim([min(lmbdas), max(lmbdas)])
plt.xlabel(r'$\lambda$',fontsize=16)
plt.ylabel('Performance',fontsize=16)
plt.tick_params(labelsize=16)
plt.show()
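# A small follow-up sketch (the choice lambda=1e-2 is an assumption, picked to
# sit in the region where the test performance above is high): refit LASSO and
# check that the learned couplings recover the nearest-neighbour interaction,
# i.e. that J[i,i+1]+J[i+1,i] comes out close to -1 for each bond.
lasso.set_params(alpha=1e-2)
lasso.fit(X_train, Y_train)
J_fit = np.array(lasso.coef_).reshape((L, L))
print([J_fit[i, (i+1) % L] + J_fit[(i+1) % L, i] for i in range(3)])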
In [3]:
print ("notebook 6: Logistic Regression, Ising")
import numpy as np
import warnings
#Comment this to turn on warnings
warnings.filterwarnings('ignore')
np.random.seed() # shuffle random seed generator
# Ising model parameters
L=40 # linear system size
J=-1.0 # Ising interaction
T=np.linspace(0.25,4.0,16) # set of temperatures
T_c=2.26 # Onsager critical temperature in the TD limit
##### prepare training and test data sets
import pickle,os
from sklearn.model_selection import train_test_split
###### define ML parameters
num_classes=2
train_to_test_ratio=0.5 # fraction of samples used for training
# path to data directory
path_to_data=os.path.expanduser('~')+'/Dropbox/MachineLearningReview/Datasets/isingMC/'
# load data
file_name = "Ising2DFM_reSample_L40_T=All.pkl" # this file contains 16*10000 samples taken in T=np.arange(0.25,4.0001,0.25)
data = pickle.load(open(path_to_data+file_name,'rb')) # pickle reads the file and returns the Python object (1D array, compressed bits)
data = np.unpackbits(data).reshape(-1, 1600) # Decompress array and reshape for convenience
data=data.astype('int')
data[np.where(data==0)]=-1 # map 0 state to -1 (Ising variable can take values +/-1)
file_name = "Ising2DFM_reSample_L40_T=All_labels.pkl" # this file contains 16*10000 samples taken in T=np.arange(0.25,4.0001,0.25)
labels = pickle.load(open(path_to_data+file_name,'rb')) # pickle reads the file and returns the Python object (here just a 1D array with the binary labels)
# divide data into ordered, critical and disordered
X_ordered=data[:70000,:]
Y_ordered=labels[:70000]
X_critical=data[70000:100000,:]
Y_critical=labels[70000:100000]
X_disordered=data[100000:,:]
Y_disordered=labels[100000:]
del data,labels
# define training and test data sets
X=np.concatenate((X_ordered,X_disordered))
Y=np.concatenate((Y_ordered,Y_disordered))
# pick random data points from ordered and disordered states
# to create the training and test sets
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=train_to_test_ratio)
# full data set
X=np.concatenate((X_critical,X))
Y=np.concatenate((Y_critical,Y))
print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print()
print(X_train.shape[0], 'train samples')
print(X_critical.shape[0], 'critical samples')
print(X_test.shape[0], 'test samples')
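# A minimal sketch of the classification step this cell prepares for, using
# scikit-learn's LogisticRegression (the regularisation strength C=1.0 and the
# default solver are arbitrary choices, not tuned):
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1.0)
logreg.fit(X_train, Y_train)
print('train accuracy:', logreg.score(X_train, Y_train))
print('test accuracy:', logreg.score(X_test, Y_test))
print('critical accuracy:', logreg.score(X_critical, Y_critical))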
In [ ]:
print ("notebook 5: Logistic Regression, SUSY")
# Importing the SUSY Data set
import sys, os
import pandas as pd
import numpy as np
import warnings
#Comment this out to turn on warnings
warnings.filterwarnings('ignore')
seed=12
np.random.seed(seed)
import tensorflow as tf
# suppress tflow compilation warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.set_random_seed(seed)
columns=["signal", "lepton 1 pT", "lepton 1 eta", "lepton 1 phi", "lepton 2 pT", "lepton 2 eta",
"lepton 2 phi", "missing energy magnitude", "missing energy phi", "MET_rel",
"axial MET", "M_R", "M_TR_2", "R", "MT2", "S_R", "M_Delta_R", "dPhi_r_b", "cos(theta_r1)"]
#Load 1,500,000 rows as train data, 50,000 as test data
filename="/Users/dylansmith/Downloads/SUSY.csv"
df_train=pd.read_csv(filename,names=columns,nrows=1500000,engine='python')
df_test=pd.read_csv(filename,names=columns,nrows=50000, skiprows=1500000,engine='python')
%matplotlib inline
#import ml_style as style
import matplotlib as mpl
import matplotlib.pyplot as plt
#mpl.rcParams.update(style.style)
def getTrainData(nVar):
ExamplesTrain = df_train.iloc[:,1:nVar+1].as_matrix()
#now the signal
ResultsTrain = df_train.iloc[:,0:1].as_matrix()
return (ExamplesTrain,ResultsTrain)
def getTestData(nVar):
ExamplesTest = df_test.iloc[:,1:nVar+1].as_matrix()
#now the signal
ResultsTest = df_test.iloc[:,0:1].as_matrix()
return (ExamplesTest,ResultsTest)
#let's define this as a function so we can call it easily
def runTensorFlowRegression(nVar,alpha):
    #make data array placeholder for the first nVar simple features
x = tf.placeholder(tf.float32,[None,nVar])
#make weights and bias
W = tf.Variable(tf.zeros([nVar,2])) #we will make y 'onehot' 0 bit is bkg, 1 bit is signal
b = tf.Variable(tf.zeros([2]))
    #make 'answer variable'; keep the raw logits for the loss below
    logits = tf.matmul(x, W) + b
    y = tf.nn.softmax(logits)
    #placeholder for correct answer
    y_ = tf.placeholder(tf.float32, [None, 2])
    #cross entropy (softmax_cross_entropy_with_logits expects raw logits, not the softmax output)
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y_)+alpha*tf.nn.l2_loss(W))
#define training step
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
#initialize variables
init = tf.global_variables_initializer()
#setup session
sess = tf.Session()
sess.run(init)
#ok now everything is setup for tensorflow, but we need the data in a useful form
#first let's get the variables
Var_train,Sig_train_bit1 = getTrainData(nVar)
#now the signal
Sig_train_bit0 = Sig_train_bit1.copy()
Sig_train_bit0 = 1 - Sig_train_bit0
Sig_train = np.column_stack((Sig_train_bit0,Sig_train_bit1))
#now run with batches of 100 data points
for i in range(0,15000):
start = i*100
end = (i+1)*100-1
batch_x = Var_train[start:end]
batch_y = Sig_train[start:end]
sess.run(train_step, feed_dict={x: batch_x, y_: batch_y})
#now test accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
#setup test data
Var_test = df_test.iloc[:,1:nVar+1].as_matrix()
#now the signal
Sig_test_bit1 = df_test.iloc[:,0:1].as_matrix()
Sig_test_bit0 = Sig_test_bit1.copy()
Sig_test_bit0 = 1 - Sig_test_bit0
Sig_test = np.column_stack((Sig_test_bit0,Sig_test_bit1))
print("Accuracy for alpha %.1E : %.3f" %(alpha,sess.run(accuracy, feed_dict={x: Var_test, y_: Sig_test})))
#get the weights
weights = W.eval(session=sess)
#get probabilities assigned (i.e. evaluate y on test data)
probs = y.eval(feed_dict = {x: Var_test}, session = sess)
#print probs
#now let's get the signal efficiency and background rejection on the test data
Acceptance = []
Rejection = []
theshes = np.arange(0,1,0.01)
for thresh in theshes:
it=0
nBkg = 0.0
nBkgCor = 0.0
nSig = 0.0
nSigCor = 0.0
for prob in probs:
if prob[0] > thresh:
sig=False
else:
sig=True
if Sig_test_bit1[it][0]: #actual signal
nSig+=1
if sig:
nSigCor+=1
else: #actual background
nBkg+=1
if not sig:
nBkgCor+=1
it+=1
        #now append signal efficiency and background rejection
Acceptance.append(nSigCor/nSig)
Rejection.append(nBkgCor/nBkg)
return (probs,Acceptance,Rejection)
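# The manual threshold scan above traces out a ROC-type curve; an equivalent and
# much faster sketch uses scikit-learn's roc_curve (signal efficiency is the
# true-positive rate, background rejection is one minus the false-positive
# rate). This helper only assumes an array of signal probabilities and the
# matching 0/1 labels.
from sklearn.metrics import roc_curve
def roc_acceptance_rejection(signal_probs, labels):
    fpr, tpr, _ = roc_curve(labels, signal_probs)
    return tpr, 1.0 - fpr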
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probsSimple,accep,rej = runTensorFlowRegression(8,alpha)
ax.scatter(accep,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
#now let's investigate how mixed the events are
probsSimple,accep,rej = runTensorFlowRegression(8,.00001)
Signal = df_test.iloc[:,0:1]
#print probsSimple[:,1]
df_test_acc = pd.DataFrame({'PROB':probsSimple[:,1]})
df_test_acc['SIG']=Signal
df_test_acc_sig = df_test_acc.query('SIG==1')
df_test_acc_bkg = df_test_acc.query('SIG==0')
df_test_acc_sig.plot(kind='hist',y='PROB',color='blue',alpha=0.5,bins=np.linspace(0,1,10),label='Signal')
df_test_acc_bkg.plot(kind='hist',y='PROB',color='red',label='Background')
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probsSimple,accep,rej = runTensorFlowRegression(18,alpha)
ax.scatter(accep,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
def runSciKitRegressionL2(nVar,alpha):
X_train, y_train = getTrainData(nVar)
X_test, y_test = getTestData(nVar)
clf = SGDClassifier(loss="log", penalty="l2",alpha=alpha,max_iter=5,tol=None)
clf.fit(X_train,y_train.ravel())
predictions = clf.predict(X_test)
print('Accuracy on test data with alpha %.2E : %.3f' %(alpha,clf.score(X_test,y_test)) )
probs = clf.predict_proba(X_test)
#print probs
#get signal acceptance and background rejection
thresholds = np.arange(0,1,.01)
Acceptance = []
BkgRejection = []
for thresh in thresholds:
it=0
nPredSig=0.0
nPredBkg=0.0
nTotSig=0.0
nTotBkg=0.0
for prob in probs:
if prob[1]>thresh:
predSig=True
else:
predSig=False
if y_test[it][0]:
Sig = True
else:
Sig = False
if Sig:
if predSig:
nPredSig+=1
nTotSig+=1
else:
if not predSig:
nPredBkg+=1
nTotBkg+=1
it+=1
Acceptance.append(nPredSig/nTotSig)
BkgRejection.append(nPredBkg/nTotBkg)
return (probs,Acceptance,BkgRejection)
def runSciKitRegressionL1(nVar,alpha):
X_train, y_train = getTrainData(nVar)
X_test, y_test = getTestData(nVar)
clf = SGDClassifier(loss="log", penalty="l1",alpha=alpha,max_iter=5,tol=None)
clf.fit(X_train,y_train.ravel())
predictions = clf.predict(X_test)
print('Accuracy on test data with alpha %.2E : %.3f' %(alpha,clf.score(X_test,y_test)) )
probs = clf.predict_proba(X_test)
#print probs
#get signal acceptance and background rejection
thresholds = np.arange(0,1,.01)
Acceptance = []
BkgRejection = []
for thresh in thresholds:
it=0
nPredSig=0.0
nPredBkg=0.0
nTotSig=0.0
nTotBkg=0.0
for prob in probs:
if prob[1]>thresh:
predSig=True
else:
predSig=False
if y_test[it][0]:
Sig = True
else:
Sig = False
if Sig:
if predSig:
nPredSig+=1
nTotSig+=1
else:
if not predSig:
nPredBkg+=1
nTotBkg+=1
it+=1
Acceptance.append(nPredSig/nTotSig)
BkgRejection.append(nPredBkg/nTotBkg)
return (probs,Acceptance,BkgRejection)
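# runSciKitRegressionL1 and runSciKitRegressionL2 differ only in the penalty
# string, so a single parametrised version is a natural refactor; a minimal
# sketch returning just the test probabilities (the threshold bookkeeping above
# could be reused unchanged):
def runSciKitRegression(nVar, alpha, penalty="l2"):
    X_train, y_train = getTrainData(nVar)
    X_test, y_test = getTestData(nVar)
    clf = SGDClassifier(loss="log", penalty=penalty, alpha=alpha, max_iter=5, tol=None)
    clf.fit(X_train, y_train.ravel())
    print('Accuracy on test data with alpha %.2E : %.3f' % (alpha, clf.score(X_test, y_test)))
    return clf.predict_proba(X_test)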
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probs,accept,rej = runSciKitRegressionL1(8,alpha)
ax.scatter(accept,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
#now let's investigate how mixed the events are
probsSimple,accep,rej = runSciKitRegressionL1(8,.5)
Signal = df_test.iloc[:,0:1]
#print probsSimple[:,1]
df_test_acc = pd.DataFrame({'PROB':probsSimple[:,1]})
df_test_acc['SIG']=Signal
df_test_acc_sig = df_test_acc.query('SIG==1')
df_test_acc_bkg = df_test_acc.query('SIG==0')
df_test_acc_sig.plot(kind='hist',y='PROB',color='blue',alpha=0.5,bins=np.linspace(0,1,10),label='Signal')
df_test_acc_bkg.plot(kind='hist',y='PROB',color='red',label='Background')
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probs,accept,rej = runSciKitRegressionL1(18,alpha)
ax.scatter(accept,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probs,accept,rej = runSciKitRegressionL2(8,alpha)
ax.scatter(accept,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
alphas = np.logspace(-10,1,11)
fig = plt.figure()
ax = fig.add_subplot(111)
it=0
for alpha in alphas:
c1 = 1.*( float(it) % 3.)/3.0
c2 = 1.*( float(it) % 9.)/9.0
c3 = 1.*( float(it) % 27.)/27.0
probs,accept,rej = runSciKitRegressionL2(18,alpha)
ax.scatter(accept,rej,c=[c1,c2,c3],label='Alpha: %.1E' %alpha)
it+=1
ax.set_xlabel('signal efficiency')
ax.set_ylabel('background rejection')
plt.legend(loc='lower left', fontsize = 'small');
plt.show()
In [ ]: