Chapter 09
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [28]:
# Build a 100-point, 2-feature synthetic set: feat_b is a noisy quadratic of
# feat_a, and a random half of the points is shifted up by 6 and labelled -1.
# (RNG call order is seed -> normal -> normal -> choice, so results are fixed.)
np.random.seed(131)
feat_a = np.random.normal(0, 1, 100)
feat_b = 3 * feat_a ** 2 + 4 + np.random.normal(0, 1, 100)
shifted = np.random.choice(100, 50, replace=False)
feat_b[shifted] += 6
X = np.column_stack((feat_a, feat_b))
y = np.ones((100, 1))
y[shifted] = -1.0
In [31]:
# One scatter call per sample: blue '+' for the +1 class, green '*' for -1.
for i, label in enumerate(y):
    color, marker = ('b', '+') if label == 1.0 else ('g', '*')
    plt.scatter([X[i, 0]], [X[i, 1]], c=color, marker=marker)
In [33]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# sklearn estimators expect a 1-D label vector.
y = y.reshape(100,)
# Fixed random_state so the split — and every score below — is reproducible
# under Restart & Run All (the original split changed on each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
# Use polynomial kernel
poly_svc = SVC(kernel='poly')
poly_svc.fit(X_train, y_train)
poly_pred = poly_svc.predict(X_test)
# accuracy_score's documented signature is (y_true, y_pred).
print('polynomial kernel test data score: ', accuracy_score(y_test, poly_pred))
In [34]:
# Evaluate an RBF-kernel SVM on the same train/test split.
rbf_svc = SVC(kernel='rbf')
rbf_svc.fit(X_train, y_train)
print('rbf kernel test dataset score: ', accuracy_score(rbf_svc.predict(X_test), y_test))
In [44]:
# 500 points uniform on [-0.5, 0.5)^2; label 1 where |x1| > |x2|
# (i.e. inside the region where x1^2 - x2^2 is positive).
x1 = np.random.uniform(0, 1, 500) - 0.5
x2 = np.random.uniform(0, 1, 500) - 0.5
y = (x1 ** 2 - x2 ** 2 > 0).astype(int)
In [47]:
# Red '+' for class 0, green '*' for class 1 (y only takes values 0/1).
neg = (y == 0)
plt.scatter(x1[neg], x2[neg], c='r', marker='+')
plt.scatter(x1[~neg], x2[~neg], c='g', marker='*')
Out[47]:
In [48]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the raw coordinates — a linear model, so it cannot
# capture the quadratic decision boundary; the plot shows its predictions.
X = np.column_stack((x1, x2))
lr = LogisticRegression()
lr.fit(X, y)
y_pred = lr.predict(X)
neg = (y_pred == 0)
plt.scatter(X[neg, 0], X[neg, 1], c='r', marker='+')
plt.scatter(X[~neg, 0], X[~neg, 1], c='g', marker='*')
Out[48]:
In [50]:
# Quadratic basis expansion (x1^2, x2^2, x1*x2): in this feature space the
# true boundary is linear, so logistic regression can now recover it.
X_new = np.column_stack((X[:, 0] ** 2, X[:, 1] ** 2, X[:, 0] * X[:, 1]))
lr = LogisticRegression()
lr.fit(X_new, y)
y_pred = lr.predict(X_new)
neg = (y_pred == 0)
plt.scatter(X[neg, 0], X[neg, 1], c='r', marker='+')
plt.scatter(X[~neg, 0], X[~neg, 1], c='g', marker='*')
Out[50]:
In [51]:
from sklearn.svm import LinearSVC

# A linear SVM on the raw features, for comparison with the kernel methods.
linear_svc = LinearSVC()
linear_svc.fit(X, y)
pred = linear_svc.predict(X)
neg = (pred == 0)
plt.scatter(X[neg, 0], X[neg, 1], c='r', marker='+')
plt.scatter(X[~neg, 0], X[~neg, 1], c='g', marker='*')
Out[51]:
In [53]:
# An RBF-kernel SVM handles the nonlinear boundary on the raw features directly.
rbf_svc = SVC(kernel='rbf')
rbf_svc.fit(X, y)
pred = rbf_svc.predict(X)
neg = (pred == 0)
plt.scatter(X[neg, 0], X[neg, 1], c='r', marker='+')
plt.scatter(X[~neg, 0], X[~neg, 1], c='g', marker='*')
Out[53]:
In [66]:
# Auto dataset: whitespace-delimited text where '?' marks missing values.
auto_file_path = '../data/Auto'
# Raw string for the regex separator — '\s+' in a plain string literal is an
# invalid escape sequence (DeprecationWarning, an error in future Pythons).
# read_csv with sep is the current idiom for delimited text.
autos = pd.read_csv(auto_file_path, sep=r'\s+', na_values='?')
autos = autos.dropna()
autos.head()
Out[66]:
In [68]:
# Binarize mpg: 1 for cars at or above the median fuel economy, else 0.
mpg_median = autos['mpg'].median()
autos['mpg_status'] = (autos['mpg'] >= mpg_median).astype(int)
autos.head()
Out[68]:
In [71]:
# pandas.tools.plotting was removed in pandas 0.20; the public location is
# pandas.plotting.
from pandas.plotting import scatter_matrix
fig, ax = plt.subplots(figsize=(15, 15))
scatter_matrix(autos, ax=ax);
In [75]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

X = autos[['displacement', 'horsepower', 'weight', 'acceleration']].values
y = autos['mpg_status'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# 5-fold CV accuracy of a linear SVM across a grid of C values.
Cs = [1, 10, 50, 100, 500, 1000]
scores = []
for c in Cs:
    clf = LinearSVC(C=c)
    score = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(score.mean())
plt.plot(Cs, scores)
Out[75]:
In [90]:
# Refit at the C chosen from the CV curve and score the held-out set.
clf = LinearSVC(C=500)
clf.fit(X_train, y_train)
print('test data set score: ', accuracy_score(clf.predict(X_test), y_test))
In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Set the parameter by cross-validation: one candidate grid per kernel family.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1, 10, 100, 1000]},
]
# Repeat the search optimizing macro-averaged precision, then recall.
scores = ['precision', 'recall']
for score in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
In [3]:
# Load the OJ purchase dataset; the CSV's first column is the row index.
oj_file_path = '../data/OJ.csv'
oj = pd.read_csv(oj_file_path, index_col=0)
oj.head()
Out[3]:
In [5]:
# List the available columns before choosing predictors.
oj.columns
Out[5]:
In [6]:
# Keep every column except the target 'Purchase'. get_dummies one-hot encodes
# the non-numeric column (presumably Store7, given the single 'Store' prefix
# and the Store_No/Store_Yes columns used below — verify against the data).
predictor_cols = ['WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
                  'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
                  'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM', 'PctDiscCH',
                  'ListPriceDiff', 'STORE']
df_X = pd.get_dummies(oj[predictor_cols], prefix=['Store'])
In [7]:
# Sanity-check the one-hot encoded feature frame.
df_X.head()
Out[7]:
In [9]:
# Assemble the design matrix (dummy columns replace Store7) and labels.
feature_cols = ['WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
                'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
                'SalePriceCH', 'PriceDiff', 'Store_No', 'Store_Yes', 'PctDiscMM',
                'PctDiscCH', 'ListPriceDiff', 'STORE']
X = df_X[feature_cols].values
y = oj['Purchase'].values
In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# test_size is chosen so that exactly 800 samples land in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(y.shape[0]-800)/y.shape[0])
In [14]:
from sklearn.svm import SVC

# Linear-kernel SVM with a small C (heavy regularization, wide margin).
clf = SVC(kernel='linear', C=0.01)
clf.fit(X_train, y_train)
Out[14]:
In [15]:
from sklearn.metrics import accuracy_score

# Training accuracy of the C = 0.01 linear SVM.
print(accuracy_score(clf.predict(X_train), y_train))
In [16]:
# Held-out accuracy of the same model.
print(accuracy_score(clf.predict(X_test), y_test))
In [21]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# 5-fold CV accuracy of the linear kernel over C in [0.1, 10].
Cs = np.linspace(0.1, 10, 10)
scores = []
for c in Cs:
    clf = SVC(C=c, kernel='linear')
    score = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(score.mean())
plt.plot(Cs, scores)
Out[21]:
In [22]:
# Refit the linear kernel at C = 3 (near the CV optimum); compare train/test.
clf = SVC(C=3, kernel='linear')
clf.fit(X_train, y_train)
print('train data set score: ', accuracy_score(clf.predict(X_train), y_train))
print('test data set score: ', accuracy_score(clf.predict(X_test), y_test))
In [23]:
# Repeat the C sweep with the RBF kernel.
Cs = np.linspace(0.1, 10, 10)
scores = [cross_val_score(SVC(C=c, kernel='rbf'), X_train, y_train, cv=5).mean()
          for c in Cs]
plt.plot(Cs, scores)
Out[23]:
In [24]:
# Final RBF model at C = 10; compare train vs. test accuracy.
clf = SVC(C=10, kernel='rbf')
clf.fit(X_train, y_train)
print('train data set score: ', accuracy_score(clf.predict(X_train), y_train))
print('test data set score: ', accuracy_score(clf.predict(X_test), y_test))
In [ ]:
# C sweep for a degree-2 polynomial kernel.
Cs = np.linspace(0.1, 10, 10)
scores = [cross_val_score(SVC(C=c, kernel='poly', degree=2),
                          X_train, y_train, cv=5).mean()
          for c in Cs]
plt.plot(Cs, scores)
In [ ]: