Chapter 08
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
from sklearn.ensemble import RandomForestRegressor
# Path to the Boston housing CSV; the first CSV column is the row index.
boston_file_name ='../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
# Preview the first rows to confirm the load.
bostons.head()
Out[4]:
In [5]:
# List the column names so the predictor list below can be copied from them.
bostons.columns
Out[5]:
In [6]:
# Predict per-town crime rate ('crim') from all remaining Boston features.
X = bostons[['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
             'ptratio', 'black', 'lstat', 'medv']].values
y = bostons['crim'].values
# Small forest: 25 trees, sqrt(p) candidate features per split.
# random_state pins the bootstrap and feature sampling so the fit reproduces.
clf1 = RandomForestRegressor(n_estimators=25, max_features='sqrt', random_state=0)
clf1.fit(X, y)
# NOTE: this is R^2 on the training data itself, so it is optimistic.
print(clf1.score(X, y))
In [7]:
# Larger forest (500 trees) for comparison with clf1 above.
# random_state makes the stochastic fit reproducible across re-runs.
clf2 = RandomForestRegressor(n_estimators=500, max_features='sqrt', random_state=0)
clf2.fit(X, y)
# Training-set R^2 again — optimistic; no held-out evaluation here.
print(clf2.score(X, y))
In [8]:
# Load the Carseats data; the first CSV column is the row index.
carseats_file_name = '../data/Carseats.csv'
carseats = pd.read_csv(carseats_file_name, index_col=0)
carseats.head()
Out[8]:
In [16]:
# Binarize Sales into a Good/Bad label, split at the mean, and recode the
# Yes/No categorical values as +1/-1 throughout the frame.
sales_mean = carseats['Sales'].values.mean()
carseats['SaleStatus'] = np.where(carseats['Sales'] > sales_mean, 'Good', 'Bad')
carseats = carseats.replace(['Yes', 'No'], [1, -1])
In [19]:
# Predictors: the numeric car-seat features (categorical columns excluded).
X = carseats[['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age', 'Education']].values
y = carseats['SaleStatus'].values
from sklearn.model_selection import train_test_split
# 60/40 train/test split; random_state fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
In [20]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: a single unpruned decision tree fit on the training split.
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
Out[20]:
In [21]:
# Accuracy of the single tree on the held-out 40%.
tree.score(X_test, y_test)
Out[21]:
In [28]:
from sklearn.ensemble import BaggingClassifier
# Bag 100 unpruned trees; averaging reduces the variance of a single tree.
clf = DecisionTreeClassifier()
# random_state pins the bootstrap resampling so the score reproduces.
bagging = BaggingClassifier(clf, n_estimators=100, random_state=0)
bagging.fit(X_train, y_train)
# Test-set accuracy of the bagged ensemble (compare with the single tree).
bagging.score(X_test, y_test)
Out[28]:
In [32]:
# Load the orange-juice purchase data and report its dimensions.
oj_file_name = '../data/OJ.csv'
ojs = pd.read_csv(oj_file_name, index_col=0)
print(ojs.shape)
ojs.head()
Out[32]:
In [33]:
# Fourteen numeric predictors for the Purchase choice.
X = ojs[['WeekofPurchase','PriceCH','PriceMM','DiscCH','DiscMM',
'SpecialCH','SpecialMM','LoyalCH','SalePriceMM','SalePriceCH',
'PriceDiff','PctDiscMM','PctDiscCH','ListPriceDiff']].values
y = ojs['Purchase'].values
# Hold out 270 of the 1070 observations as the test set, seeded split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=270/1070, random_state=0)
In [34]:
# Unpruned tree on the OJ training set.
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
# Training accuracy (optimistic; compare with the test score below).
tree.score(X_train, y_train)
Out[34]:
In [36]:
# Test accuracy of the same tree; compare with the training score above.
tree.score(X_test, y_test)
Out[36]:
In [38]:
pred = tree.predict(X_test)
from sklearn.metrics import confusion_matrix
# Rows: true Purchase class; columns: predicted class.
confusion_matrix(y_test, pred)
Out[38]:
In [39]:
from sklearn.model_selection import cross_val_score

def scores_by_tree_depth(depth):
    """Mean 5-fold CV accuracy of a depth-limited tree on the OJ training set.

    NOTE: reads the module-level X_train / y_train defined by the split above.
    """
    clf = DecisionTreeClassifier(max_depth=depth)
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    return scores.mean()

# Sweep candidate depths and plot mean CV accuracy against depth.
depthes = range(5, 15)
scores = []
for depth in depthes:
    scores.append(scores_by_tree_depth(depth))
plt.plot(depthes, scores)
plt.show()
When the tree's maximum depth is 7, it achieves the highest cross-validation score.
In [40]:
# Refit with the depth selected by cross-validation above and evaluate on test.
tree = DecisionTreeClassifier(max_depth=7)
tree.fit(X_train, y_train)
tree.score(X_test, y_test)
Out[40]:
In [2]:
# Load the Hitters (baseball salary) data; first CSV column is the row index.
hitters_file_name = '../data/Hitters.csv'
hitters = pd.read_csv(hitters_file_name, index_col=0)
hitters.head()
Out[2]:
In [31]:
# Drop rows containing missing values before modeling.
hitters = hitters.dropna()
hitters.head()
Out[31]:
In [32]:
# Number of complete cases remaining after dropna().
hitters.shape[0]
Out[32]:
In [33]:
# One-hot encode the categorical columns; the prefixes name the resulting
# indicator columns (League_*, Division_*, NewLeague_*).
hitters = pd.get_dummies(hitters, prefix=['League', 'Division', 'NewLeague'])
hitters.head()
In [34]:
# Inspect the post-encoding column names for the predictor list below.
hitters.columns
Out[34]:
In [35]:
# All predictors for Salary. Note each one-hot pair (e.g. League_A/League_N)
# is perfectly collinear; trees and ensembles are unaffected, but this would
# matter for the linear baseline further down.
X = hitters[['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',
'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists',
'Errors', 'League_A', 'League_N', 'Division_E', 'Division_W',
'NewLeague_A', 'NewLeague_N' ]].values
y = hitters['Salary'].values
In [36]:
from sklearn.model_selection import train_test_split
# Hold out 63 of the 263 players, i.e. a 200-observation training set.
# random_state=0 matches the other splits in this notebook and makes the
# downstream scores reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=(263 - 200) / 263, random_state=0)
In [37]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# Sweep the boosting learning rate and plot the test-set R^2 for each value.
learning_rates = np.linspace(0.01, 0.2, 20)
scores = []
for learning_rate in learning_rates:
    estimator = DecisionTreeRegressor()
    clf = AdaBoostRegressor(estimator, n_estimators=1000, learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
plt.plot(learning_rates, scores)
plt.show()
In [18]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares as a linear baseline for the salary regression;
# fit() returns the estimator, so construction and fitting chain.
clf = LinearRegression().fit(X_train, y_train)
print(clf.score(X_test, y_test))
In [23]:
from sklearn.ensemble import BaggingRegressor
# Bag 1000 regression trees. Pass n_estimators by keyword (the documented
# form, robust to signature changes) and seed the bootstrap for
# reproducibility.
estimator = DecisionTreeRegressor()
clf = BaggingRegressor(estimator, n_estimators=1000, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
In [24]:
# Load the Caravan insurance data; first CSV column is the row index.
caravan_file_name = '../data/Caravan.csv'
caravans = pd.read_csv(caravan_file_name, index_col=0)
caravans.head()
Out[24]:
In [25]:
# List the column names to build the predictor list below.
caravans.columns
Out[25]:
In [26]:
# Every column except the target 'Purchase' is used as a predictor.
X = caravans[['MOSTYPE', 'MAANTHUI', 'MGEMOMV', 'MGEMLEEF', 'MOSHOOFD', 'MGODRK',
'MGODPR', 'MGODOV', 'MGODGE', 'MRELGE', 'MRELSA', 'MRELOV', 'MFALLEEN',
'MFGEKIND', 'MFWEKIND', 'MOPLHOOG', 'MOPLMIDD', 'MOPLLAAG', 'MBERHOOG',
'MBERZELF', 'MBERBOER', 'MBERMIDD', 'MBERARBG', 'MBERARBO', 'MSKA',
'MSKB1', 'MSKB2', 'MSKC', 'MSKD', 'MHHUUR', 'MHKOOP', 'MAUT1', 'MAUT2',
'MAUT0', 'MZFONDS', 'MZPART', 'MINKM30', 'MINK3045', 'MINK4575',
'MINK7512', 'MINK123M', 'MINKGEM', 'MKOOPKLA', 'PWAPART', 'PWABEDR',
'PWALAND', 'PPERSAUT', 'PBESAUT', 'PMOTSCO', 'PVRAAUT', 'PAANHANG',
'PTRACTOR', 'PWERKT', 'PBROM', 'PLEVEN', 'PPERSONG', 'PGEZONG',
'PWAOREG', 'PBRAND', 'PZEILPL', 'PPLEZIER', 'PFIETS', 'PINBOED',
'PBYSTAND', 'AWAPART', 'AWABEDR', 'AWALAND', 'APERSAUT', 'ABESAUT',
'AMOTSCO', 'AVRAAUT', 'AAANHANG', 'ATRACTOR', 'AWERKT', 'ABROM',
'ALEVEN', 'APERSONG', 'AGEZONG', 'AWAOREG', 'ABRAND', 'AZEILPL',
'APLEZIER', 'AFIETS', 'AINBOED', 'ABYSTAND']].values
y = caravans['Purchase'].values
In [27]:
# Total number of observations (used to size the train/test split below).
caravans.shape[0]
Out[27]:
In [29]:
# Keep 1000 observations for training; seed the split for reproducibility,
# consistent with the earlier splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=(5822 - 1000) / 5822, random_state=0)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
estimator = DecisionTreeClassifier()
# n_estimators and learning_rate are keyword-only in scikit-learn >= 1.0;
# passing them positionally raises TypeError there.
clf = AdaBoostClassifier(estimator, n_estimators=1000, learning_rate=0.01)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))