In [54]:
import numpy as np
from sklearn.svm import SVR
from scipy import interpolate
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib nbagg
fsize = (10,6) #Tuple containing the inline figure size for all the plots

In this notebook we'll run through the steps to generate a surrogate model of an N-dimensional data set using scikit-learn's Support Vector Machine (SVM) regression tools and a Radial Basis Function (RBF) kernel.
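
For reference, the RBF kernel scores the similarity of two points as K(x, x') = exp(-gamma * ||x - x'||^2), and a fitted SVR predicts with a weighted sum of these kernels centred on its support vectors, plus a bias term. A minimal numpy sketch of the kernel itself, just for intuition (scikit-learn computes this internally):

In [ ]:
def rbf_kernel(x1, x2, gamma):
    # Gaussian similarity: 1 when the points coincide, decaying towards 0 as they move apart
    return np.exp(-gamma * np.sum((np.asarray(x1) - np.asarray(x2))**2))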

But first, let's start with a simple example where we model a one-dimensional sine function.

Modeling the Sine Function

First, we prepare our data. We'll generate a random data set x of input parameters: a vector of 100 random values drawn uniformly and scaled to the interval [0, 4π]. From the x values, we'll generate a set y of outputs using a sine function with some added Gaussian noise. Our goal will be to generate a model using radial basis functions that accurately predicts the functional relationship between the x inputs and the y outputs.


In [55]:
N = 100
PI = np.pi
domain = 4*PI # the domain in which the data points are generated
nf = 5 # noise factor - gain of the added Gaussian noise
outliers = 97 # controls how often outliers appear: one is injected every (N - outliers) points

def f(x):
    return np.sin(x)

def generate_dataset(function):
    # Create our data X and outputs Y
    X = np.sort(domain * np.random.random((N,1)), axis = 0)
    Y = function(X).ravel()

    # Add Gaussian noise and some outliers to the outputs
    Y[::] += nf * 0.1 * (0.5 - np.random.rand(N))
    Y[::(N - outliers)] += nf * (0.5 - np.random.rand(int(np.ceil(N/float((N-outliers))))))
    
    return X, Y

X, Y = generate_dataset(f)

fig = plt.figure(figsize=fsize)
plt.scatter(X, Y, label='Data', marker='x')
plt.plot(X, np.sin(X), label='Sine Function')
plt.legend(loc='lower left')
plt.show()


Now that we have our dataset, let's set up our Support Vector Regression (SVR) machine using an RBF kernel.


In [56]:
# Create an SVR instance using an RBF kernel
svr = SVR(kernel='rbf', C=1e4, gamma=0.01)

# Train the svr on the data and predict it over the same inputs.
rbf_model = svr.fit(X, Y)
y_rbf = rbf_model.predict(X)

# Now plot everything
fig = plt.figure(figsize = fsize)
plt.scatter(X, Y, marker = 'x', label = 'data')
plt.plot(X, y_rbf, c = 'r', marker = '+', label='RBF Surrogate Model')
ysin = np.sin(X)
plt.plot(X, ysin, label = 'Sine Function')

plt.xlabel('data')
plt.ylabel('outputs')
plt.legend(loc='lower left')
plt.show()
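
Before playing with the hyperparameters, it's worth a quick peek at what the fitted model actually stores. The prediction is built from a subset of the training points (the support vectors) and their dual coefficients, both of which scikit-learn's SVR exposes as attributes:

In [ ]:
# Points that ended up on or outside the epsilon-tube are kept as support vectors;
# the prediction is a weighted sum of RBF bumps centred on them.
print("support vectors kept: %d of %d" % (rbf_model.support_vectors_.shape[0], N))
print("dual coefficient range: [%0.2f, %0.2f]"
      % (rbf_model.dual_coef_.min(), rbf_model.dual_coef_.max()))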


Let's see how the C and gamma hyperparameters affect the fit of the SVR surrogate model.


In [58]:
n_hyp = 3

def f(x):
    return np.sin(x)

C_min = 0.1
C_max = 0.5
gamma_min = 1
gamma_max = 10

X, Y = generate_dataset(f)

fig, axes = plt.subplots(n_hyp, n_hyp, 
                         figsize = (12,10), 
                         sharex=True, sharey=True)

fig.subplots_adjust(hspace = 0.12, wspace=0.08,
                    left = 0.04, right = 0.98, top=0.96, bottom=0.16)
for n, C_val in enumerate(np.linspace(C_min, C_max, n_hyp)):
    for m, gamma_val in enumerate(np.linspace(gamma_min, gamma_max, n_hyp)):
        #print n, m, C_val, gamma_val
        # Create an SVR instance using an RBF kernel
        svr = SVR(kernel='rbf', C=C_val, gamma=gamma_val)

        # Train the svr on the data and predict it over the same inputs.
        rbf_model = svr.fit(X, Y)
        y_rbf = rbf_model.predict(X)
        
        ax = axes[n][m]
        ax.scatter(X, Y, marker = 'x', label = 'data')
        ax.plot(X, y_rbf, c = 'r', marker = '+', label='RBF Surrogate Model')
        y_true = f(X)
        ax.plot(X, y_true, label = 'Target Function')
        ax.set_title("C = %1.2f, gamma = %3.1f" % (C_val, gamma_val), loc='center')

plt.legend(loc=(-0.2,-0.6))
plt.show()
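
As an aside, rather than eyeballing a grid of plots we can let cross-validation pick C and gamma for us. A sketch using GridSearchCV (in the scikit-learn version this notebook was written against it lives in sklearn.grid_search; in recent releases import it from sklearn.model_selection and use the 'neg_mean_squared_error' scorer instead):

In [ ]:
from sklearn.grid_search import GridSearchCV

# Cross-validated search over a small grid of C and gamma values;
# the grid bounds here are illustrative, not tuned.
param_grid = {'C': np.logspace(-1, 4, 6), 'gamma': np.logspace(-2, 1, 4)}
search = GridSearchCV(SVR(kernel='rbf'), param_grid,
                      scoring='mean_squared_error', cv=5)
search.fit(X, Y)
print(search.best_params_)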


So for the simple case where we're trying to model a sine function of a single input, SVM RBF regression works pretty well (at least over the domain of the inputs we trained the SVM on). What happens if we take our SVM and use it to predict values across a domain we didn't train on?


In [53]:
Xnew = np.sort(8*PI * np.random.random((N*100,1)), 0)
y_pred = rbf_model.predict(Xnew)

fig = plt.figure(figsize=fsize)
plt.scatter(X, Y, marker = 'x', label = 'data')
plt.plot(Xnew, y_pred, c = 'r', marker = '+', label = 'RBF Surrogate Model')
plt.plot(Xnew, np.sin(Xnew), label = 'Sine Function')

plt.xlabel('data')
plt.ylabel('outputs')
plt.legend(loc='lower right')
plt.show()

The RBF kernel doesn't extrapolate well: outside the range it was trained on, every RBF basis function decays to zero, so the prediction falls back towards the model's constant bias term instead of following the sine function.
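
We can see this directly: far from the training data every kernel term has decayed to zero and the prediction collapses to the bias, which scikit-learn exposes as the intercept_ attribute of the fitted model. A quick check:

In [ ]:
# Far from the training data, K(x, x_i) = exp(-gamma * ||x - x_i||^2) underflows to zero
# for every support vector x_i, so the prediction is just the bias term.
x_far = np.array([[100.0 * PI]])
print(rbf_model.predict(x_far), rbf_model.intercept_)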

Cross Validation

Let's try cross-validating our RBF model using scikit-learn's cross_validation module. We'll perform Leave-One-Out (LOO) cross-validation using the LeaveOneOut generator.


In [5]:
from sklearn import cross_validation

# Use scikit-learn's cross-validation tool and a leave-one-out generator to cross-validate
# our model. We'll use the mean squared error as a scoring metric and get the mean and
# standard deviation of the scores.
cv_scores = cross_validation.cross_val_score(svr, X, Y, 
                                             cv = cross_validation.LeaveOneOut(N),
                                             scoring = 'mean_squared_error')

# Get the average and standard deviation of the cross-validated scores
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

# Plot the cross-validation scores for each run of the LOO cross-validation.
fig = plt.figure()
cv_x = np.arange(0,N)
# scikit-learn scorers are defined so that larger is better, so the MSE comes back
# negated; flip the sign to recover the ordinary (positive) MSE per left-out point.
cv_scores = -cv_scores
plt.plot(cv_x, cv_scores)
print("mean: %0.2f +/- %0.2f" % (cv_mean, 3 * cv_std))
plt.show()


mean: -0.83 +/- 5.01
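
For a rough sanity check we can compare the leave-one-out estimate against the in-sample error of the same estimator refit on all the data (a sketch reusing the svr instance from above; expect the LOO error to be somewhat larger):

In [ ]:
from sklearn.metrics import mean_squared_error

# In-sample MSE of the model fitted on all of the data, for comparison
# with the leave-one-out estimate above.
in_sample_mse = mean_squared_error(Y, svr.fit(X, Y).predict(X))
print("in-sample MSE: %0.2f" % in_sample_mse)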

We can also use sklearn.cross_validation to generate cross-validated predictions from our model.


In [16]:
predicted = cross_validation.cross_val_predict(svr, X, Y, 
                                               cv=cross_validation.LeaveOneOut(N))

fig, (ax1, ax2) = plt.subplots(1, 2)

ax1.plot(np.sort(X, axis=0), predicted, c='r')
ax1.plot(np.sort(X, axis=0), np.sin(X), c='b')

ax1.set_xlabel('data')
ax1.set_ylabel('predicted values')
# Parity plot: cross-validated predictions against the measured outputs;
# points on the dashed line would be perfect predictions.
ax2.scatter(Y, predicted)
ax2.plot([np.amin(Y), np.amax(Y)], [np.amin(Y), np.amax(Y)], 'k--' )
plt.show()


Higher-Dimensional Data Sets

Let's generalize this to a higher-dimensional dataset. We'll try to model a surface with two input parameters and one output using RBF SVR.


In [9]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

N = 200
X = np.random.random((N,2))
def f(x,y):
    return np.sin(x) + np.sin(y)
Y = [f(x,y) for x, y in X]

# Create an SVR instance using an RBF kernel
C_param = 1e4
gamma_param = 0.1
svr = SVR(kernel='rbf', C=C_param, gamma=gamma_param)

# Train the svr on the data and predict it over the same inputs.
rbf_model = svr.fit(X, Y)
y_rbf = rbf_model.predict(X)


fig = plt.figure()
ax = fig.gca(projection='3d')
# plot_trisurf handles scattered (non-gridded) points; plot_surface would need 2D meshgrid arrays
surf = ax.plot_trisurf(X[:,0], X[:,1], Y, cmap=cm.coolwarm)
fig.colorbar(surf)
print(np.shape(X))
#plt.scatter(X[:,0], X[:,1], c = Y)
#plt.scatter(X[:,0], X[:,1], c = y_rbf)
plt.show()


(200, 2)
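
To judge the two-input fit quantitatively rather than just visually, we can evaluate the surrogate on a fresh random sample from the same domain and compare against the true function (a quick sketch; the sample size is arbitrary):

In [ ]:
# Evaluate the surrogate on points it was not trained on, drawn from the same unit square.
X_test = np.random.random((1000, 2))
y_true = np.array([f(x, y) for x, y in X_test])
y_pred = rbf_model.predict(X_test)
print("max abs error: %0.3f" % np.max(np.abs(y_pred - y_true)))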

In [ ]: