Chapter 09



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

4



In [28]:

    
# Simulate 100 two-dimensional observations in which x_2 depends
# quadratically on x_1, then split them into two classes.
np.random.seed(131)  # fixed seed so the simulated data can be regenerated
x_1 = np.random.normal(0, 1, 100)
noise = np.random.normal(0, 1, 100)
x_2 = 3 * x_1 * x_1 + 4 + noise
# Choose 50 distinct observations at random and shift them up by 6 along x_2;
# the shifted observations become the second class.
indices = np.random.choice(100, 50, replace=False)
x_2[indices] += 6
# Design matrix with one row per observation: columns are (x_1, x_2).
X = np.column_stack((x_1, x_2))
# Labels as a (100, 1) column: +1.0 everywhere, -1.0 for the shifted points.
y = np.ones((100, 1))
y[indices] = -1.0



In [31]:

    
# Plot the two simulated classes with one vectorised scatter call per class
# instead of one call per observation (the original loop created a separate
# artist for every point).
# ravel() lets the boolean mask index rows of X whether y is still the
# (100, 1) column built above or has already been flattened.
positive = y.ravel() == 1.0
plt.scatter(X[positive, 0], X[positive, 1], c='b', marker='+')
# Trailing ';' suppresses the PathCollection repr, so the cell still shows
# only the figure, as the original loop did.
plt.scatter(X[~positive, 0], X[~positive, 1], c='g', marker='*');



In [33]:

    
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Flatten the label column into a 1-D vector of length 100.
y = y.reshape(100)
# Hold out 40% of the observations as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Support vector machine with a polynomial kernel.
poly_svc = SVC(kernel='poly')
poly_svc.fit(X_train, y_train)
poly_pred = poly_svc.predict(X_test)
poly_accuracy = accuracy_score(poly_pred, y_test)
print('polynomial kernel test data score: ', poly_accuracy)









    



polynomial kernel test data score:  0.975



In [34]:

    
# Same train/test split as the polynomial model, but with a radial
# (RBF) kernel.
rbf_svc = SVC(kernel='rbf')
rbf_pred = rbf_svc.fit(X_train, y_train).predict(X_test)
rbf_accuracy = accuracy_score(rbf_pred, y_test)
print('rbf kernel test dataset score: ', rbf_accuracy)









    



rbf kernel test dataset score:  0.875

5

5(a)



In [44]:

    
# Two predictors, 500 draws each, uniform on [0, 1) and then centred by
# subtracting 0.5.
x1 = np.random.uniform(0, 1, 500) - 0.5
x2 = np.random.uniform(0, 1, 500) - 0.5
# Integer class label: 1 where x1^2 - x2^2 > 0, otherwise 0.
y = (x1 * x1 - x2 * x2 > 0).astype(int)

5(b)



In [47]:

    
# Observed classes: class 0 drawn as red '+', class 1 as green '*'.
class0 = y == 0
class1 = y == 1
plt.scatter(x1[class0], x2[class0], c='r', marker='+')
plt.scatter(x1[class1], x2[class1], c='g', marker='*')









    Out[47]:





<matplotlib.collections.PathCollection at 0x11a8ae080>

5(c,d)



In [48]:

    
from sklearn.linear_model import LogisticRegression
# Logistic regression on the two raw predictors.
X = np.column_stack((x1, x2))
lr = LogisticRegression()
lr.fit(X, y)
# Predict on the training data and colour each point by its *predicted* class.
y_pred = lr.predict(X)
pred0 = y_pred == 0
pred1 = y_pred == 1
plt.scatter(X[pred0, 0], X[pred0, 1], c='r', marker='+')
plt.scatter(X[pred1, 0], X[pred1, 1], c='g', marker='*')









    Out[48]:





<matplotlib.collections.PathCollection at 0x11ab33f60>

5 (e,f)



In [50]:

    
# Expand the predictors into non-linear features: x1^2, x2^2 and x1*x2.
col1, col2 = X[:, 0], X[:, 1]
X_new = np.column_stack((np.power(col1, 2), np.power(col2, 2), col1 * col2))
# Refit logistic regression on the transformed features.
lr = LogisticRegression()
lr.fit(X_new, y)
y_pred = lr.predict(X_new)
# Plot in the ORIGINAL (x1, x2) coordinates, coloured by predicted class.
pred0 = y_pred == 0
pred1 = y_pred == 1
plt.scatter(X[pred0, 0], X[pred0, 1], c='r', marker='+')
plt.scatter(X[pred1, 0], X[pred1, 1], c='g', marker='*')









    Out[50]:





<matplotlib.collections.PathCollection at 0x11b3e6860>

5(g)



In [51]:

    
from sklearn.svm import LinearSVC
# Linear support vector classifier fitted on the two raw predictors.
linear_svc = LinearSVC()
y_pred = linear_svc.fit(X, y).predict(X)
# Colour the training points by predicted class.
pred0 = y_pred == 0
pred1 = y_pred == 1
plt.scatter(X[pred0, 0], X[pred0, 1], c='r', marker='+')
plt.scatter(X[pred1, 0], X[pred1, 1], c='g', marker='*')









    Out[51]:





<matplotlib.collections.PathCollection at 0x11b3b5b38>

5(h)



In [53]:

    
# SVM with a radial (RBF) kernel fitted on the same two raw predictors.
rbf_svc = SVC(kernel='rbf')
y_pred = rbf_svc.fit(X, y).predict(X)
# Colour the training points by predicted class.
pred0 = y_pred == 0
pred1 = y_pred == 1
plt.scatter(X[pred0, 0], X[pred0, 1], c='r', marker='+')
plt.scatter(X[pred1, 0], X[pred1, 1], c='g', marker='*')









    Out[53]:





<matplotlib.collections.PathCollection at 0x11a6bff60>

6

Pass

7



In [66]:

    
# Load the Auto data: a whitespace-delimited text file in which '?' marks a
# missing value.
auto_file_path = '../data/Auto'
# The separator is a raw string: '\s' inside a plain string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
autos = pd.read_table(auto_file_path, sep=r'\s+', na_values='?')
# Drop every row that contains at least one missing value.
autos = autos.dropna()
autos.head()









    Out[66]:






  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      year
      origin
      name
    
  
  
    
      0
      18.0
      8
      307.0
      130.0
      3504.0
      12.0
      70
      1
      chevrolet chevelle malibu
    
    
      1
      15.0
      8
      350.0
      165.0
      3693.0
      11.5
      70
      1
      buick skylark 320
    
    
      2
      18.0
      8
      318.0
      150.0
      3436.0
      11.0
      70
      1
      plymouth satellite
    
    
      3
      16.0
      8
      304.0
      150.0
      3433.0
      12.0
      70
      1
      amc rebel sst
    
    
      4
      17.0
      8
      302.0
      140.0
      3449.0
      10.5
      70
      1
      ford torino

7(a)



In [68]:

    
# Binary response: 1 when a car's mpg is at or above the median mpg, else 0.
mpg_median = np.median(autos['mpg'])
autos['mpg_status'] = (autos['mpg'] >= mpg_median).astype(int)
autos.head()









    Out[68]:






  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      year
      origin
      name
      mpg_status
    
  
  
    
      0
      18.0
      8
      307.0
      130.0
      3504.0
      12.0
      70
      1
      chevrolet chevelle malibu
      0
    
    
      1
      15.0
      8
      350.0
      165.0
      3693.0
      11.5
      70
      1
      buick skylark 320
      0
    
    
      2
      18.0
      8
      318.0
      150.0
      3436.0
      11.0
      70
      1
      plymouth satellite
      0
    
    
      3
      16.0
      8
      304.0
      150.0
      3433.0
      12.0
      70
      1
      amc rebel sst
      0
    
    
      4
      17.0
      8
      302.0
      140.0
      3449.0
      10.5
      70
      1
      ford torino
      0

7(b)



In [71]:

    
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# location is not available in current pandas releases.
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the Auto columns on a 15x15 inch figure.
# NOTE: passing a single Axes makes pandas clear the figure and rebuild the
# subplot grid inside it, which emits a harmless UserWarning.
fig, ax = plt.subplots(figsize=(15, 15))
scatter_matrix(autos, ax=ax);









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)

7(c)



In [75]:

    
# cross_val_score is provided by sklearn.model_selection; the legacy
# sklearn.cross_validation module has been removed from scikit-learn, and the
# rest of this notebook already imports from model_selection.
from sklearn.model_selection import cross_val_score
# Predictors: four numeric car attributes; response: the binary mpg_status.
X = autos[['displacement','horsepower','weight','acceleration']].values
y = autos['mpg_status'].values
# Hold out 40% of the cars for the final test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Candidate cost values for the linear support vector classifier.
Cs = [1,10,50,100,500,1000]
scores = []
for c in Cs:
    clf = LinearSVC(C=c)
    # Mean 5-fold cross-validated score on the training portion only.
    score = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(score.mean())
plt.plot(Cs,scores)









    Out[75]:





[<matplotlib.lines.Line2D at 0x11fdcc5f8>]



In [90]:

    
# Refit the linear classifier with cost C=500 on the full training split and
# report its accuracy on the held-out test split.
clf = LinearSVC(C=500)
pred = clf.fit(X_train, y_train).predict(X_test)
test_accuracy = accuracy_score(pred, y_test)
print('test data set score: ', test_accuracy)









    



test data set score:  0.866242038217

7(c,d)



In [ ]:

    
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Hyper-parameter grids explored by 5-fold cross-validation: an RBF kernel
# over gamma x C, and a polynomial kernel over C alone.
cost_grid = [1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': cost_grid},
    {'kernel': ['poly'], 'C': cost_grid},
]
# Run one grid search per metric; the scorer name is '<metric>_macro'.
scores = ['precision', 'recall']
for score in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring=score + '_macro')
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clf.best_params_)

8



In [3]:

    
# Load the OJ (orange juice purchases) data from a path relative to the notebook.
oj_file_path = '../data/OJ.csv'
# index_col=0: the first CSV column is used as the row index rather than as data.
oj = pd.read_csv(oj_file_path, index_col=0)
# Preview the first rows.
oj.head()









    Out[3]:






  
    
      
      Purchase
      WeekofPurchase
      StoreID
      PriceCH
      PriceMM
      DiscCH
      DiscMM
      SpecialCH
      SpecialMM
      LoyalCH
      SalePriceMM
      SalePriceCH
      PriceDiff
      Store7
      PctDiscMM
      PctDiscCH
      ListPriceDiff
      STORE
    
  
  
    
      1
      CH
      237
      1
      1.75
      1.99
      0.00
      0.0
      0
      0
      0.500000
      1.99
      1.75
      0.24
      No
      0.000000
      0.000000
      0.24
      1
    
    
      2
      CH
      239
      1
      1.75
      1.99
      0.00
      0.3
      0
      1
      0.600000
      1.69
      1.75
      -0.06
      No
      0.150754
      0.000000
      0.24
      1
    
    
      3
      CH
      245
      1
      1.86
      2.09
      0.17
      0.0
      0
      0
      0.680000
      2.09
      1.69
      0.40
      No
      0.000000
      0.091398
      0.23
      1
    
    
      4
      MM
      227
      1
      1.69
      1.69
      0.00
      0.0
      0
      0
      0.400000
      1.69
      1.69
      0.00
      No
      0.000000
      0.000000
      0.00
      1
    
    
      5
      CH
      228
      7
      1.69
      1.69
      0.00
      0.0
      0
      0
      0.956535
      1.69
      1.69
      0.00
      Yes
      0.000000
      0.000000
      0.00
      0

8(a)



In [5]:

    
# List the column names so the predictor columns can be selected explicitly below.
oj.columns









    Out[5]:





Index(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
       'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
       'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM', 'PctDiscCH',
       'ListPriceDiff', 'STORE'],
      dtype='object')



In [6]:

    
# Predictor columns: every OJ column listed here, i.e. all but the response
# 'Purchase'.
predictor_columns = [
    'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
    'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
    'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM', 'PctDiscCH',
    'ListPriceDiff', 'STORE',
]
# One-hot encode the categorical column(s); the dummy columns get the prefix
# 'Store' (the displayed result shows Store_No / Store_Yes replacing Store7).
df_X = pd.get_dummies(oj[predictor_columns], prefix=['Store'])



In [7]:

    
# Preview the encoded predictors to check the dummy columns that were created.
df_X.head()









    Out[7]:






  
    
      
      WeekofPurchase
      StoreID
      PriceCH
      PriceMM
      DiscCH
      DiscMM
      SpecialCH
      SpecialMM
      LoyalCH
      SalePriceMM
      SalePriceCH
      PriceDiff
      PctDiscMM
      PctDiscCH
      ListPriceDiff
      STORE
      Store_No
      Store_Yes
    
  
  
    
      1
      237
      1
      1.75
      1.99
      0.00
      0.0
      0
      0
      0.500000
      1.99
      1.75
      0.24
      0.000000
      0.000000
      0.24
      1
      1
      0
    
    
      2
      239
      1
      1.75
      1.99
      0.00
      0.3
      0
      1
      0.600000
      1.69
      1.75
      -0.06
      0.150754
      0.000000
      0.24
      1
      1
      0
    
    
      3
      245
      1
      1.86
      2.09
      0.17
      0.0
      0
      0
      0.680000
      2.09
      1.69
      0.40
      0.000000
      0.091398
      0.23
      1
      1
      0
    
    
      4
      227
      1
      1.69
      1.69
      0.00
      0.0
      0
      0
      0.400000
      1.69
      1.69
      0.00
      0.000000
      0.000000
      0.00
      1
      1
      0
    
    
      5
      228
      7
      1.69
      1.69
      0.00
      0.0
      0
      0
      0.956535
      1.69
      1.69
      0.00
      0.000000
      0.000000
      0.00
      0
      0
      1



In [9]:

    
# Column order of the design matrix, with the two Store dummies placed where
# the original Store7 column used to be.
model_columns = [
    'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
    'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
    'SalePriceCH', 'PriceDiff', 'Store_No', 'Store_Yes', 'PctDiscMM', 'PctDiscCH',
    'ListPriceDiff', 'STORE',
]
# Plain NumPy arrays for scikit-learn: predictors and the 'Purchase' response.
X = df_X[model_columns].values
y = oj['Purchase'].values



In [10]:

    
# train_test_split is provided by sklearn.model_selection; the legacy
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
# Keep exactly 800 observations for training. The test size is passed as an
# integer count rather than a computed fraction so that floating-point
# rounding can never shift the split by one row.
n_test = y.shape[0] - 800
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test)

8(b)



In [14]:

    
from sklearn.svm import SVC
# Support vector classifier (linear kernel) with a small cost, C=0.01.
linear_params = {'C': 0.01, 'kernel': 'linear'}
clf = SVC(**linear_params)
# fit() is left as the last expression so the cell displays the fitted estimator.
clf.fit(X_train, y_train)









    Out[14]:





SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

8(c)



In [15]:

    
from sklearn.metrics import accuracy_score
# Accuracy of the fitted classifier on the data it was trained on.
train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(train_pred, y_train)
print(train_accuracy)



In [16]:

    
# Accuracy of the fitted classifier on the held-out test observations.
test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(test_pred, y_test)
print(test_accuracy)









    



0.803703703704

8(d)



In [21]:

    
# cross_val_score is provided by sklearn.model_selection; the legacy
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import cross_val_score
# Ten evenly spaced cost values between 0.1 and 10 (inclusive).
Cs = np.linspace(0.1,10, 10)
scores = []
for c in Cs:
    clf = SVC(C=c, kernel='linear')
    # Mean 5-fold cross-validated score on the training split for this cost.
    score = cross_val_score(clf,X_train,y_train,cv=5)
    scores.append(score.mean())
# Cross-validated score as a function of cost.
plt.plot(Cs,scores)









    Out[21]:





[<matplotlib.lines.Line2D at 0x11f5bf470>]

8(e)



In [22]:

    
# Refit the linear-kernel classifier with cost C=3 and report accuracy on
# both the training and the test split.
clf = SVC(kernel='linear', C=3)
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
train_accuracy = accuracy_score(pred_train, y_train)
test_accuracy = accuracy_score(pred_test, y_test)
print('train data set score: ', train_accuracy)
print('test data set score: ', test_accuracy)









    



train data set score:  0.8275
test data set score:  0.848148148148

8(f)



In [23]:

    
# Cost search repeated for a radial (RBF) kernel: ten evenly spaced cost
# values between 0.1 and 10, each scored by 5-fold cross-validation.
Cs = np.linspace(0.1, 10, num=10)
scores = []
for c in Cs:
    clf = SVC(kernel='rbf', C=c)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(fold_scores.mean())
# Mean cross-validated score as a function of cost.
plt.plot(Cs, scores)









    Out[23]:





[<matplotlib.lines.Line2D at 0x11f60a9b0>]



In [24]:

    
# Refit the RBF-kernel classifier with cost C=10 and report accuracy on both
# the training and the test split.
clf = SVC(kernel='rbf', C=10)
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
train_accuracy = accuracy_score(pred_train, y_train)
test_accuracy = accuracy_score(pred_test, y_test)
print('train data set score: ', train_accuracy)
print('test data set score: ', test_accuracy)









    



train data set score:  0.85375
test data set score:  0.818518518519

8(g)



In [ ]:

    
# Cost search repeated for a degree-2 polynomial kernel: ten evenly spaced
# cost values between 0.1 and 10, each scored by 5-fold cross-validation.
Cs = np.linspace(0.1, 10, num=10)
scores = []
for c in Cs:
    clf = SVC(kernel='poly', degree=2, C=c)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5)
    scores.append(fold_scores.mean())
# Mean cross-validated score as a function of cost.
plt.plot(Cs, scores)



In [ ]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130.0	3504.0	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693.0	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150.0	3436.0	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150.0	3433.0	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140.0	3449.0	10.5	70	1	ford torino

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
1	CH	237	1	1.75	1.99	0.00	0.0	0	0.500000	1.99	1.75	0.24	No	0.000000	0.000000	0.24	1
2	CH	239	1	1.75	1.99	0.00	0.3	1	0.600000	1.69	1.75	-0.06	No	0.150754	0.000000	0.24	1
3	CH	245	1	1.86	2.09	0.17	0.0	0	0.680000	2.09	1.69	0.40	No	0.000000	0.091398	0.23	1
4	MM	227	1	1.69	1.69	0.00	0.0	0	0.400000	1.69	1.69	0.00	No	0.000000	0.000000	0.00	1
5	CH	228	7	1.69	1.69	0.00	0.0	0	0.956535	1.69	1.69	0.00	Yes	0.000000	0.000000	0.00	0