Chapter 09
In [1]:
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
    
In [28]:
    
# Simulate 100 two-feature observations: x_2 is a noisy quadratic function of x_1.
np.random.seed(131)  # fixed seed so the simulated data are reproducible
x_1 = np.random.normal(0, 1, 100)
noise = np.random.normal(0, 1, 100)
x_2 = 3 * x_1 * x_1 + 4 + noise
# Choose 50 distinct observations, shift them up by 6 along x_2 and label them -1;
# every other observation keeps the label +1.
indices = np.random.choice(100, 50, replace=False)
x_2[indices] += 6
X = np.array([x_1, x_2]).T
y = np.ones((100, 1))
y[indices] = -1.0
    
In [31]:
    
# Draw each class with a single scatter call instead of one call per point.
# The labels are flattened first so the boolean mask works whether `y` is a
# column vector of shape (n, 1) or a flat vector of shape (n,).
is_positive = np.ravel(y) == 1.0
# Label +1 -> blue '+', every other label -> green '*'.
plt.scatter(X[is_positive, 0], X[is_positive, 1], c='b', marker='+')
plt.scatter(X[~is_positive, 0], X[~is_positive, 1], c='g', marker='*');
    
    
In [33]:
    
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Flatten the label array to 1-D without hard-coding the number of samples.
y = y.reshape(-1)
# Hold out 40% of the observations as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# use polynomial kernel
poly_svc = SVC(kernel='poly')
poly_svc.fit(X_train, y_train)
poly_pred = poly_svc.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print('polynomial kernel test data score: ', accuracy_score(y_test, poly_pred))
    
    
In [34]:
    
# use rbf kernel
rbf_svc = SVC(kernel='rbf')
rbf_svc.fit(X_train, y_train)
rbf_pred = rbf_svc.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print('rbf kernel test dataset score: ', accuracy_score(y_test, rbf_pred))
    
    
In [44]:
    
# Draw 500 points with both coordinates uniform on [-0.5, 0.5).
x1 = np.random.uniform(0, 1, 500) - 0.5
x2 = np.random.uniform(0, 1, 500) - 0.5
# Class 1 where x1^2 - x2^2 is strictly positive, class 0 otherwise.
y = (x1 * x1 - x2 * x2 > 0).astype(int)
    
In [47]:
    
# Plot the true labels: class 0 as red '+', class 1 as green '*'.
class_zero = y == 0
class_one = y == 1
plt.scatter(x1[class_zero], x2[class_zero], c='r', marker='+')
plt.scatter(x1[class_one], x2[class_one], c='g', marker='*')
    
    Out[47]:
    
In [48]:
    
from sklearn.linear_model import LogisticRegression
# Put the two coordinates side by side: one row per point, one column per feature.
X = np.array([x1, x2]).T
# Fit logistic regression on the raw features and predict on the same points.
lr = LogisticRegression()
lr.fit(X, y)
y_pred = lr.predict(X)
# Colour the points by predicted class: 0 as red '+', 1 as green '*'.
predicted_zero = y_pred == 0
predicted_one = y_pred == 1
plt.scatter(X[predicted_zero, 0], X[predicted_zero, 1], c='r', marker='+')
plt.scatter(X[predicted_one, 0], X[predicted_one, 1], c='g', marker='*')
    
    Out[48]:
    
In [50]:
    
# Build the non-linear feature set: both squared coordinates and their product.
first_coord = X[:, 0]
second_coord = X[:, 1]
X_new = np.array([
    np.power(first_coord, 2),
    np.power(second_coord, 2),
    first_coord * second_coord,
]).T
# Refit logistic regression on the expanded features and predict on the same points.
lr = LogisticRegression()
lr.fit(X_new, y)
y_pred = lr.predict(X_new)
# Plot in the original coordinates, coloured by predicted class.
predicted_zero = y_pred == 0
predicted_one = y_pred == 1
plt.scatter(X[predicted_zero, 0], X[predicted_zero, 1], c='r', marker='+')
plt.scatter(X[predicted_one, 0], X[predicted_one, 1], c='g', marker='*')
    
    Out[50]:
    
In [51]:
    
from sklearn.svm import LinearSVC
# Fit a linear support vector classifier on the raw features and predict on the same points.
linear_svc = LinearSVC()
linear_svc.fit(X, y)
y_pred = linear_svc.predict(X)
# Colour the points by predicted class: 0 as red '+', 1 as green '*'.
predicted_zero = y_pred == 0
predicted_one = y_pred == 1
plt.scatter(X[predicted_zero, 0], X[predicted_zero, 1], c='r', marker='+')
plt.scatter(X[predicted_one, 0], X[predicted_one, 1], c='g', marker='*')
    
    Out[51]:
    
In [53]:
    
# Fit an RBF-kernel support vector classifier on the raw features and predict on the same points.
rbf_svc = SVC(kernel='rbf')
rbf_svc.fit(X, y)
y_pred = rbf_svc.predict(X)
# Colour the points by predicted class: 0 as red '+', 1 as green '*'.
predicted_zero = y_pred == 0
predicted_one = y_pred == 1
plt.scatter(X[predicted_zero, 0], X[predicted_zero, 1], c='r', marker='+')
plt.scatter(X[predicted_one, 0], X[predicted_one, 1], c='g', marker='*')
    
    Out[53]:
    
In [66]:
    
auto_file_path = '../data/Auto'
# Columns are separated by runs of whitespace and '?' entries are read as missing.
# The separator is a raw string: '\s' is not a valid escape sequence in a normal
# string literal and triggers a warning on recent Python versions.
autos = pd.read_table(auto_file_path, sep=r'\s+', na_values='?')
# Drop every row that contains a missing value.
autos = autos.dropna()
autos.head()
    
    Out[66]:
In [68]:
    
# Binary target: 1 when a car's mpg is at or above the median mpg, 0 otherwise.
mpg_median = np.median(autos['mpg'])
autos['mpg_status'] = (autos['mpg'] >= mpg_median).astype('int64')
autos.head()
    
    Out[68]:
In [71]:
    
# `pandas.tools.plotting` was removed from pandas; scatter_matrix lives in `pandas.plotting`.
from pandas.plotting import scatter_matrix
# Let scatter_matrix build its own grid of axes at the requested size. Passing a
# single pre-made Axes for a multi-panel plot makes pandas clear that figure and
# emit a warning.
scatter_matrix(autos, figsize=(15, 15));
    
    
    
In [75]:
    
# `sklearn.cross_validation` was removed from scikit-learn; cross_val_score lives
# in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
# Predict the binary mpg_status from four numeric car attributes.
X = autos[['displacement','horsepower','weight','acceleration']].values
y = autos['mpg_status'].values
# Hold out 40% of the observations as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Candidate values for the penalty parameter C.
Cs = [1,10,50,100,500,1000]
scores = []
for c in Cs:
    clf = LinearSVC(C=c)
    # Mean 5-fold cross-validation score on the training split for this C.
    score = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(score.mean())
plt.plot(Cs,scores)
    
    Out[75]:
    
In [90]:
    
# Refit a linear SVC with C=500 on the training split and score it on the held-out split.
clf = LinearSVC(C=500)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print('test data set score: ', accuracy_score(y_test, pred))
    
    
In [ ]:
    
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Hyper-parameter candidates to evaluate by cross-validation, one grid per kernel.
rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
poly_grid = {'kernel': ['poly'], 'C': [1, 10, 100, 1000]}
tuned_parameters = [rbf_grid, poly_grid]
# Run one 5-fold grid search per metric, optimising its macro-averaged variant.
scores = ['precision', 'recall']
for score in scores:
    scoring_name = '%s_macro' % score
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=scoring_name)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    
In [3]:
    
# Read the OJ data set; the first CSV column is used as the row index.
oj_file_path = '../data/OJ.csv'
oj = pd.read_csv(
    oj_file_path,
    index_col=0,
)
oj.head()
    
    Out[3]:
In [5]:
    
# List the column labels of the `oj` frame.
oj.columns
    
    Out[5]:
In [6]:
    
# Predictor columns taken from `oj`.
predictor_columns = [
    'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
    'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
    'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM', 'PctDiscCH',
    'ListPriceDiff', 'STORE',
]
df_X = oj[predictor_columns]
# One-hot encode the categorical column(s); the resulting indicator columns get the prefix 'Store'.
df_X = pd.get_dummies(df_X, prefix=['Store'])
    
In [7]:
    
# Preview the first rows of the encoded predictor frame.
df_X.head()
    
    Out[7]:
In [9]:
    
# Column order of the design matrix, including the Store_No / Store_Yes indicator columns.
model_columns = [
    'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
    'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
    'SalePriceCH', 'PriceDiff', 'Store_No', 'Store_Yes', 'PctDiscMM', 'PctDiscCH',
    'ListPriceDiff', 'STORE',
]
X = df_X[model_columns].values
# Target: the 'Purchase' column of the original frame.
y = oj['Purchase'].values
    
In [10]:
    
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split lives
# in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
# Keep exactly 800 observations for training and the remainder for testing.
# An integer train_size avoids converting the count to a float fraction, which
# can round to a different split size.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=800)
    
In [14]:
    
from sklearn.svm import SVC
# Linear-kernel support vector classifier with penalty parameter C = 0.01,
# fitted on the training split.
clf = SVC(kernel='linear', C=0.01)
clf.fit(X_train, y_train)
    
    Out[14]:
In [15]:
    
from sklearn.metrics import accuracy_score
# Accuracy of the fitted classifier on the training split.
train_pred = clf.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print(accuracy_score(y_train, train_pred))
    
    
In [16]:
    
# Accuracy of the fitted classifier on the held-out split.
test_pred = clf.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print(accuracy_score(y_test, test_pred))
    
    
In [21]:
    
# `sklearn.cross_validation` was removed from scikit-learn; cross_val_score lives
# in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
# Ten evenly spaced candidate values of C between 0.1 and 10.
Cs = np.linspace(0.1,10, 10)
scores = []
for c in Cs:
    clf = SVC(C=c, kernel='linear')
    # Mean 5-fold cross-validation score on the training split for this C.
    score = cross_val_score(clf,X_train,y_train,cv=5)
    scores.append(score.mean())
plt.plot(Cs,scores)
    
    Out[21]:
    
In [22]:
    
# Refit the linear-kernel SVC with C=3 and report accuracy on both splits.
clf = SVC(C=3,kernel='linear')
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print('train data set score: ', accuracy_score(y_train, pred_train))
pred_test = clf.predict(X_test)
print('test data set score: ', accuracy_score(y_test, pred_test))
    
    
In [23]:
    
# Ten evenly spaced candidate values of C between 0.1 and 10.
Cs = np.linspace(0.1, 10, 10)
# Mean 5-fold cross-validation score of an RBF-kernel SVC for each candidate C.
scores = [
    cross_val_score(SVC(C=c_value, kernel='rbf'), X_train, y_train, cv=5).mean()
    for c_value in Cs
]
plt.plot(Cs, scores)
    
    Out[23]:
    
In [24]:
    
# Refit the RBF-kernel SVC with C=10 and report accuracy on both splits.
clf = SVC(C=10,kernel='rbf')
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
print('train data set score: ', accuracy_score(y_train, pred_train))
pred_test = clf.predict(X_test)
print('test data set score: ', accuracy_score(y_test, pred_test))
    
    
In [ ]:
    
# Ten evenly spaced candidate values of C between 0.1 and 10.
Cs = np.linspace(0.1, 10, 10)
# Mean 5-fold cross-validation score of a degree-2 polynomial-kernel SVC for each candidate C.
scores = [
    cross_val_score(SVC(C=c_value, kernel='poly', degree=2), X_train, y_train, cv=5).mean()
    for c_value in Cs
]
plt.plot(Cs, scores)
    
In [ ]: