Develop a model that can be used to make accurate predictions
Images: www.cs.utexas.edu/~mooney/cs391L/slides/svm.ppt
Image: www.cs.colostate.edu/~asa/pdfs/howto.pdf
In [42]:
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Create a 2D data set
X, Y = make_moons(n_samples=10000, noise=0.30, random_state=1)
# Identify sample classes
class0_actual = np.where(Y == 0)
class1_actual = np.where(Y == 1)
# Plot data
plt.scatter(X[class0_actual, 0], X[class0_actual, 1], c='red')
plt.scatter(X[class1_actual, 0], X[class1_actual, 1], c='blue')
Out[42]:
In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
In [44]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1).fit(X_train, Y_train)
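Because the kernel here is linear, the fitted model exposes an explicit separating hyperplane w·x + b = 0. A minimal sketch of inspecting it (coef_, intercept_, and n_support_ are standard SVC attributes; coef_ is only available for the linear kernel):
In [ ]:
# Coefficients of the separating hyperplane w.x + b = 0 (linear kernel only)
print("w:", clf.coef_)
print("b:", clf.intercept_)
# Number of support vectors kept for each class
print("support vectors per class:", clf.n_support_)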
In [45]:
from sklearn.metrics import accuracy_score
accuracy = clf.score(X_test, Y_test)
print("Accuracy 2: %0.2f" % (accuracy))
In [46]:
from sklearn.metrics import confusion_matrix
Y_pred = clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)
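The raw counts in the confusion matrix can be summarized as per-class precision, recall, and F1. A short sketch using classification_report on the same test-set predictions:
In [ ]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the linear-kernel predictions
print(classification_report(Y_test, Y_pred))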
In [47]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Y_pred = clf.predict(X)
# Identify sample classes
class0 = np.where(Y_pred == 0)
class1 = np.where(Y_pred == 1)
# Plot data
plt.subplot(1, 2, 1)
plt.scatter(X[class0, 0], X[class0, 1], c='r')
plt.scatter(X[class1, 0], X[class1, 1], c='b')
plt.title('Predicted Classes')
plt.subplot(1, 2, 2)
plt.scatter(X[class0_actual, 0], X[class0_actual, 1], c='red')
plt.scatter(X[class1_actual, 0], X[class1_actual, 1], c='blue')
plt.title('Actual Classes')
Out[47]:
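To see where the straight-line boundary fails on the moons data, the decision regions can be drawn by predicting over a dense grid. A minimal sketch, assuming a 0.02 grid step and 0.5 padding (both arbitrary choices):
In [ ]:
# Evaluate the current (linear-kernel) classifier on a grid covering the data
xx, yy = np.meshgrid(np.arange(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 0.02),
                     np.arange(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 0.02))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# Shade the predicted regions and overlay the samples
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=5)
plt.title('Linear SVM decision regions')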
In [48]:
clf = svm.SVC(kernel='rbf', gamma=1, C=1).fit(X_train, Y_train)
In [49]:
accuracy = clf.score(X_test, Y_test)
print("Accuracy 2: %0.2f" % (accuracy))
In [50]:
from sklearn.metrics import confusion_matrix
Y_pred = clf.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(cm)
In [51]:
import numpy as np
# Get predicted classes for all data
Y_pred = clf.predict(X)
# Identify sample classes
class0 = np.where(Y_pred == 0)
class1 = np.where(Y_pred == 1)
# Plot data
plt.subplot(1, 2, 1)
plt.scatter(X[class0, 0], X[class0, 1], c='r')
plt.scatter(X[class1, 0], X[class1, 1], c='b')
plt.title('Predicted Classes')
plt.subplot(1, 2, 2)
plt.scatter(X[class0_actual, 0], X[class0_actual, 1], c='red')
plt.scatter(X[class1_actual, 0], X[class1_actual, 1], c='blue')
plt.title('Actual Classes')
Out[51]:
In [32]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-2, 1], 'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]},
                    {'kernel': ['poly'], 'C': [1, 10, 100]}]
clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5)
clf.fit(X, Y)
print(clf.best_params_)
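best_params_ only reports the winning settings. GridSearchCV also stores the corresponding cross-validated score and, with the default refit behavior, the best estimator refit on all of the data; a short sketch of reading those off (best_score_ and best_estimator_ are standard GridSearchCV attributes):
In [ ]:
# Best mean cross-validated accuracy and the estimator refit with those settings
print("Best CV accuracy: %0.2f" % clf.best_score_)
print(clf.best_estimator_)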
"The data is in the comma separated values (CSV) format. Each row in this data set represents a molecule. The first column contains experimental data describing a real biological response; the molecule was seen to elicit this response (1), or not (0). The remaining columns represent molecular descriptors (d1 through d1776), these are caclulated properties that can capture some of the characteristics of the molecule - for example size, shape, or elemental constitution. The descriptor matrix has been normalized."
In [33]:
import pickle
X_data = pickle.load( open( "X_data.p", "rb" ) )
Y_data = pickle.load( open( "Y_data.p", "rb" ) )
In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.4, random_state=0)
In [35]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1).fit(X_train, Y_train)
In [36]:
accuracy = clf.score(X_test, Y_test)
print("Accuracy 2: %0.2f" % (accuracy))
In [52]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-2, 1], 'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]},
                    {'kernel': ['poly'], 'C': [1, 10, 100]}]
clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5)
clf.fit(X_data, Y_data)
print(clf.best_params_)
In [ ]:
from sklearn import svm
clf = svm.SVC(kernel='rbf', C=10).fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)
print("Accuracy: %0.2f" % accuracy)
In [ ]: