Chapter 08


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

7


In [4]:
from sklearn.ensemble import RandomForestRegressor
boston_file_name ='../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
bostons.head()


Out[4]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2

In [5]:
bostons.columns


Out[5]:
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat', 'medv'],
      dtype='object')

In [6]:
X = bostons[['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat', 'medv']].values
y = bostons['crim'].values
clf1 = RandomForestRegressor(n_estimators=25,max_features='sqrt')
clf1.fit(X,y)
print(clf1.score(X,y))


0.915251068291

In [7]:
clf2 = RandomForestRegressor(n_estimators=500,max_features='sqrt')
clf2.fit(X,y)
print(clf2.score(X,y))


0.938260147606

8


In [8]:
carseats_file_name = '../data/Carseats.csv'
carseats = pd.read_csv(carseats_file_name, index_col=0)
carseats.head()


Out[8]:
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
1 9.50 138 73 11 276 120 Bad 42 17 Yes Yes
2 11.22 111 48 16 260 83 Good 65 10 Yes Yes
3 10.06 113 35 10 269 80 Medium 59 12 Yes Yes
4 7.40 117 100 4 466 97 Medium 55 14 Yes Yes
5 4.15 141 64 3 340 128 Bad 38 13 Yes No

In [16]:
sales_mean = np.mean(carseats['Sales'].values)
carseats['SaleStatus'] = ['Good' if sale > sales_mean else 'Bad' for sale in carseats['Sales']]
carseats=carseats.replace(['Yes','No'],[1,-1])

8(a)


In [19]:
X = carseats[['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age', 'Education']].values
y = carseats['SaleStatus'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

8(b)


In [20]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)


Out[20]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [21]:
tree.score(X_test, y_test)


Out[21]:
0.74375000000000002

8(c)


In [28]:
from sklearn.ensemble import BaggingClassifier
clf = DecisionTreeClassifier()
bagging = BaggingClassifier(clf, n_estimators=100)
bagging.fit(X_train, y_train)
bagging.score(X_test, y_test)


Out[28]:
0.71250000000000002

9


In [32]:
oj_file_name = '../data/OJ.csv'
ojs = pd.read_csv(oj_file_name, index_col=0)
print(ojs.shape)
ojs.head()


(1070, 18)
Out[32]:
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1 CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
2 CH 239 1 1.75 1.99 0.00 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0.000000 0.24 1
3 CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
4 MM 227 1 1.69 1.69 0.00 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0.000000 0.00 1
5 CH 228 7 1.69 1.69 0.00 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0.000000 0.00 0

9(a)


In [33]:
X = ojs[['WeekofPurchase','PriceCH','PriceMM','DiscCH','DiscMM',
         'SpecialCH','SpecialMM','LoyalCH','SalePriceMM','SalePriceCH',
         'PriceDiff','PctDiscMM','PctDiscCH','ListPriceDiff']].values
y = ojs['Purchase'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=270/1070, random_state=0)

9(b)


In [34]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree.score(X_train, y_train)


Out[34]:
0.98875000000000002

9(e)


In [36]:
tree.score(X_test, y_test)


Out[36]:
0.74814814814814812

In [38]:
pred = tree.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)


Out[38]:
array([[119,  34],
       [ 34,  83]])

9(f,g,h)


In [39]:
from sklearn.model_selection import cross_val_score
def scores_by_tree_depth(depth):
    clf = DecisionTreeClassifier(max_depth=depth)
    scores = cross_val_score(clf,X_train, y_train, cv=5)
    return scores.mean()

depthes = range(5,15)
scores = []
for depth in depthes:
    scores.append(scores_by_tree_depth(depth))
plt.plot(depthes, scores)
plt.show()


when the tree's max-depth is 7, it has highest scores.

9(k)


In [40]:
tree = DecisionTreeClassifier(max_depth=7)
tree.fit(X_train, y_train)
tree.score(X_test, y_test)


Out[40]:
0.77777777777777779

10


In [2]:
hitters_file_name = '../data/Hitters.csv'
hitters = pd.read_csv(hitters_file_name, index_col=0)
hitters.head()


Out[2]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
-Andy Allanson 293 66 1 30 29 14 1 293 66 1 30 29 14 A E 446 33 20 NaN A
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 321 414 375 N W 632 43 10 475.0 N
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 224 266 263 A W 880 82 14 480.0 A
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 828 838 354 N E 200 11 3 500.0 N
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 48 46 33 N E 805 40 4 91.5 N

10(a)


In [31]:
hitters = hitters.dropna()
hitters.head()


Out[31]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun ... PutOuts Assists Errors Salary League_A League_N Division_E Division_W NewLeague_A NewLeague_N
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69 ... 632 43 10 475.0 0 1 0 1 0 1
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63 ... 880 82 14 480.0 1 0 0 1 1 0
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225 ... 200 11 3 500.0 0 1 1 0 0 1
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12 ... 805 40 4 91.5 0 1 1 0 0 1
-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19 ... 282 421 25 750.0 1 0 0 1 1 0

5 rows × 23 columns

10(b)


In [32]:
hitters.shape[0]


Out[32]:
263

In [33]:
hitters = pd.get_dummies(hitters, prefix=['League', 'Division', 'NewLeague'])
hitters.head()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-33-ef3d19f83de3> in <module>()
----> 1 hitters = pd.get_dummies(hitters, prefix=['League', 'Division', 'NewLeague'])
      2 hitters.head()

/Users/gaufung/anaconda/lib/python3.6/site-packages/pandas/core/reshape.py in get_dummies(data, prefix, prefix_sep, dummy_na, columns, sparse, drop_first)
   1072                                                        len(columns_to_encode)))
   1073 
-> 1074         check_len(prefix, 'prefix')
   1075         check_len(prefix_sep, 'prefix_sep')
   1076         if isinstance(prefix, compat.string_types):

/Users/gaufung/anaconda/lib/python3.6/site-packages/pandas/core/reshape.py in check_len(item, name)
   1070                 if not len(item) == len(columns_to_encode):
   1071                     raise ValueError(length_msg.format(name, len(item),
-> 1072                                                        len(columns_to_encode)))
   1073 
   1074         check_len(prefix, 'prefix')

ValueError: Length of 'prefix' (3) did not match the length of the columns being encoded (0).

In [34]:
hitters.columns


Out[34]:
Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
       'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists',
       'Errors', 'Salary', 'League_A', 'League_N', 'Division_E', 'Division_W',
       'NewLeague_A', 'NewLeague_N'],
      dtype='object')

In [35]:
X = hitters[['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
            'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists',
            'Errors', 'League_A', 'League_N', 'Division_E', 'Division_W',
            'NewLeague_A', 'NewLeague_N' ]].values
y = hitters['Salary'].values

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = (263-200)/263)

10(c,d)


In [37]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
learning_trates = np.linspace(0.01, 0.2, 20)
scores = []
for learning_trate in learning_trates:
    estimator = DecisionTreeRegressor()
    clf = AdaBoostRegressor(estimator, n_estimators=1000, learning_rate=learning_trate)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
plt.plot(learning_trates, scores)
plt.show()


10(e)


In [18]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


0.561498691173

10(g)


In [23]:
from sklearn.ensemble import BaggingRegressor
estimator = DecisionTreeRegressor()
clf = BaggingRegressor(estimator, 1000)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


0.597167114709

11


In [24]:
caravan_file_name = '../data/Caravan.csv'
caravans = pd.read_csv(caravan_file_name, index_col=0)
caravans.head()


Out[24]:
MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE MRELGE ... APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND Purchase
1 33 1 3 2 8 0 5 1 3 7 ... 0 0 0 1 0 0 0 0 0 No
2 37 1 2 2 8 1 4 1 4 6 ... 0 0 0 1 0 0 0 0 0 No
3 37 1 2 2 8 0 4 2 4 3 ... 0 0 0 1 0 0 0 0 0 No
4 9 1 3 3 3 2 3 2 4 5 ... 0 0 0 1 0 0 0 0 0 No
5 40 1 4 2 10 1 4 1 4 7 ... 0 0 0 1 0 0 0 0 0 No

5 rows × 86 columns

11(a)


In [25]:
caravans.columns


Out[25]:
Index(['MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD', 'MGODRK',
       'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV', 'MFALLEEN',
       'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG', 'MBERHOOG',
       'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA',
       'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT2',
       'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045', 'MINK4575',
       'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWABEDR',
       'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT', 'PAANHANG',
       'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PGEZONG',
       'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS', 'PINBOED',
       'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT', 'ABESAUT',
       'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT', 'ABROM',
       'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND', 'AZEILPL',
       'APLEZIER', 'AFIETS', 'AINBOED', 'ABYSTAND', 'Purchase'],
      dtype='object')

In [26]:
X = caravans[['MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD', 'MGODRK',
       'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV', 'MFALLEEN',
       'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG', 'MBERHOOG',
       'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA',
       'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT2',
       'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045', 'MINK4575',
       'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWABEDR',
       'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT', 'PAANHANG',
       'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PGEZONG',
       'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS', 'PINBOED',
       'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT', 'ABESAUT',
       'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT', 'ABROM',
       'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND', 'AZEILPL',
       'APLEZIER', 'AFIETS', 'AINBOED', 'ABYSTAND']].values
y = caravans['Purchase'].values

In [27]:
caravans.shape[0]


Out[27]:
5822

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = (5822-1000)/5822)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
estimator = DecisionTreeClassifier()
clf = AdaBoostClassifier(estimator, 1000, 0.01)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


0.913936126089