In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import tree
from matplotlib import pyplot as plt
from IPython.display import Image
from io import StringIO
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,mean_squared_error
import graphviz as gv
%matplotlib inline
In [4]:
# read the Hitters data and drop rows with missing values
hitters_df = pd.read_csv('data/Hitters.csv')
hitters_df.dropna(inplace=True)
hitters_df.head()
Out[4]:
In [6]:
collists = ['Years','Hits']
X = hitters_df[collists].values
y = hitters_df['Salary'].values
# log-transform the response (Salary)
y_log = np.log2(y)
In [7]:
# fit an unpruned regression tree on Years and Hits
clf = tree.DecisionTreeRegressor()
clf.fit(X,y_log)
clf.feature_importances_
Out[7]:
In [9]:
# visualise the tree via graphviz
dot_data = StringIO()
tree.export_graphviz(clf,out_file=dot_data)
data = dot_data.getvalue()
In [33]:
g = gv.Source(data)
g.format = 'png'
g.render()
Image('Source.gv.png')
# we can see the unpruned tree is overly complex: it overfits the training data
Out[33]:
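The overfitting claim can be checked by holding out part of the data; the following cell is a sketch (not part of the original lab), with an arbitrary split and a max_depth of 3 chosen only for illustration.
In [ ]:
# sketch: compare an unpruned tree with a shallow one on a held-out split
Xtr,Xte,ytr,yte = train_test_split(X,y_log,test_size=0.3,random_state=0)
full = tree.DecisionTreeRegressor().fit(Xtr,ytr)
small = tree.DecisionTreeRegressor(max_depth=3).fit(Xtr,ytr)
print('unpruned    train/test MSE:', mean_squared_error(ytr,full.predict(Xtr)), mean_squared_error(yte,full.predict(Xte)))
print('max_depth=3 train/test MSE:', mean_squared_error(ytr,small.predict(Xtr)), mean_squared_error(yte,small.predict(Xte)))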
In [2]:
from sklearn.ensemble import BaggingRegressor
In [8]:
breg = BaggingRegressor()
breg.fit(X,y_log)
Out[8]:
In [12]:
ypred = breg.predict(X)
In [15]:
# training MSE (optimistic: the bagged model was fit on the same data)
mean_squared_error(y_log,ypred)
Out[15]:
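The MSE above is measured on the training data, so it is optimistic. A sketch of the held-out comparison (the split and random_state are arbitrary choices, not part of the original notebook):
In [ ]:
# sketch: bagging train vs. test MSE on a held-out split
Xtr,Xte,ytr,yte = train_test_split(X,y_log,test_size=0.3,random_state=0)
breg2 = BaggingRegressor(n_estimators=100,random_state=0).fit(Xtr,ytr)
print('train MSE:', mean_squared_error(ytr,breg2.predict(Xtr)))
print('test MSE :', mean_squared_error(yte,breg2.predict(Xte)))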
In [5]:
carseats_df = pd.read_csv('data/Carseats.csv')
# recode Sales into a binary variable High (1 if Sales > 8, else 0)
carseats_df["High"] = carseats_df["Sales"].map(lambda x : 0 if x <=8 else 1)
carseats_df["ShelveLoc"] = pd.factorize(carseats_df['ShelveLoc'])[0]
carseats_df["Urban"] = pd.factorize(carseats_df['Urban'])[0]
carseats_df["US"] = pd.factorize(carseats_df['US'])[0]
carseats_df.head()
Out[5]:
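pd.factorize assigns arbitrary integer codes, which imposes an ordering on ShelveLoc; trees can tolerate that, but a one-hot encoding is the safer general recipe. A sketch using pd.get_dummies (column names follow pandas' default prefixes):
In [ ]:
# sketch: one-hot encode the categorical columns instead of integer-coding them
carseats_raw = pd.read_csv('data/Carseats.csv')
carseats_oh = pd.get_dummies(carseats_raw, columns=['ShelveLoc','Urban','US'], drop_first=True)
carseats_oh.head()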
In [45]:
collist = [col for col in carseats_df.columns if col != 'Sales' and col !='High']
X = carseats_df[collist].values
y = carseats_df['High'].values
# fit an unpruned classification tree
dt = tree.DecisionTreeClassifier()
dt.fit(X,y)
Out[45]:
In [46]:
#Visualise tree
dot_data = StringIO()
tree.export_graphviz(dt,out_file=dot_data)
data = dot_data.getvalue()
g = gv.Source(data)
g.format = 'png'
g.filename = 'tree1'
g.render()
Out[46]:
In [47]:
#show
Image('tree1.png')
Out[47]:
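Newer scikit-learn (0.21+) can draw the fitted tree directly with matplotlib, avoiding the graphviz round trip; a minimal sketch assuming the dt fitted above:
In [ ]:
# sketch: draw the fitted tree without graphviz (scikit-learn >= 0.21)
plt.figure(figsize=(20,10))
tree.plot_tree(dt, feature_names=collist, filled=True, max_depth=3)
plt.show()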
In [50]:
# use cross-validation to choose max_depth
kfold = KFold(n_splits=10)
accs = []
max_depths = range(2,20)
for depth in max_depths:
    k_accs = []
    for train,test in kfold.split(X):
        Xtrain,Xtest,ytrain,ytest = X[train],X[test],y[train],y[test]
        dt = tree.DecisionTreeClassifier(max_depth=depth)
        dt.fit(Xtrain,ytrain)
        ypred = dt.predict(Xtest)
        k_accs.append(accuracy_score(ytest,ypred))
    accs.append(np.mean(k_accs))
# plot mean CV accuracy against tree depth
plt.plot(max_depths,accs)
plt.xlabel('max_depth')
plt.ylabel('mean CV accuracy')
Out[50]:
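The same depth search can be written more compactly with cross_val_score; this sketch should mirror the loop above (the default scoring for a classifier is accuracy):
In [ ]:
# sketch: the 10-fold depth search via cross_val_score
from sklearn.model_selection import cross_val_score
cv_accs = [cross_val_score(tree.DecisionTreeClassifier(max_depth=d), X, y, cv=10).mean() for d in max_depths]
plt.plot(max_depths, cv_accs)
plt.xlabel('max_depth')
plt.ylabel('mean CV accuracy')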
In [56]:
# refit on the full data with the depth suggested by the CV curve
dt1 = tree.DecisionTreeClassifier(max_depth=8)
dt1.fit(X,y)
Out[56]:
In [60]:
dot_data1 = StringIO()
tree.export_graphviz(dt1,out_file=dot_data1)
g = gv.Source(dot_data1.getvalue())
g.format = 'png'
g.filename = 'tree2'
g.render()
Out[60]:
In [61]:
Image('tree2.png')
Out[61]:
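confusion_matrix was imported at the top but never used; a sketch of evaluating a depth-8 tree on a held-out Carseats split (the split and random_state are illustrative):
In [ ]:
# sketch: confusion matrix for a depth-8 tree on a held-out split
Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.3,random_state=0)
dt2 = tree.DecisionTreeClassifier(max_depth=8).fit(Xtr,ytr)
print(confusion_matrix(yte, dt2.predict(Xte)))
print('accuracy:', accuracy_score(yte, dt2.predict(Xte)))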
In [2]:
from sklearn.ensemble import RandomForestRegressor
In [3]:
boston_df = pd.read_csv('data/boston.csv')
boston_df.head()
Out[3]:
In [4]:
collists = [col for col in boston_df.columns if col != 'medv']
X = boston_df[collists].values
y = boston_df['medv'].values
#Split into training and test sets
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.10,random_state=50)
# train a random forest; the regression default uses all features at each split, so this is equivalent to bagging
rreg = RandomForestRegressor(n_estimators=500,oob_score=True)
rreg.fit(Xtrain,ytrain)
Out[4]:
In [63]:
# predict on the held-out test set; report test MSE and OOB R^2
ypred = rreg.predict(Xtest)
mean_squared_error(ytest,ypred),rreg.oob_score_
Out[63]:
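Because oob_score=True was set, the forest also stores per-sample out-of-bag predictions, which give an error estimate without touching the test set; a sketch using the oob_prediction_ attribute:
In [ ]:
# sketch: out-of-bag MSE from the stored OOB predictions
print('OOB MSE :', mean_squared_error(ytrain, rreg.oob_prediction_))
print('OOB R^2 :', rreg.oob_score_)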
In [94]:
# rerun the random forest with max_features = sqrt(p)
rreg1 = RandomForestRegressor(n_estimators=500,oob_score=True,max_features='sqrt',random_state=50)
rreg1.fit(Xtrain,ytrain)
ypred = rreg1.predict(Xtest)
print(mean_squared_error(ytest,ypred), rreg1.oob_score_)
#plot feature importance
feature_importance = rreg1.feature_importances_
#make importance relative to max importance
feature_importance = 100.0* (feature_importance/feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
plt.barh(pos,feature_importance[sorted_idx],align ='center')
cols = [collists[i] for i in sorted_idx]
plt.yticks(pos,cols)
plt.xlabel('Relative Importance')
plt.title("Variable Importance")
Out[94]:
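Impurity-based importances can be misleading for correlated or high-cardinality features; scikit-learn 0.22+ offers permutation importance as a cross-check. A sketch, assuming the fitted rreg1 and the Boston test split above:
In [ ]:
# sketch: permutation importance on the test split (scikit-learn >= 0.22)
from sklearn.inspection import permutation_importance
perm = permutation_importance(rreg1, Xtest, ytest, n_repeats=10, random_state=0)
order = np.argsort(perm.importances_mean)
pos = np.arange(len(order)) + 0.5
plt.barh(pos, perm.importances_mean[order], align='center')
plt.yticks(pos, [collists[i] for i in order])
plt.xlabel('Mean decrease in score (permutation)')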
In [71]:
# find a good max_features by comparing test MSE and OOB R^2
oob_scores = []
mses = []
num_feats = range(1,14)
for feat in num_feats:
    regx = RandomForestRegressor(n_estimators=500,max_features=feat,oob_score=True)
    regx.fit(Xtrain,ytrain)
    ypred = regx.predict(Xtest)
    mses.append(mean_squared_error(ytest,ypred))
    oob_scores.append(regx.oob_score_)
plt.plot(num_feats,mses,color='b',lw=2.5,label='test MSE')
plt.plot(num_feats,oob_scores,color='r',lw=2.5,label='OOB $R^2$')
plt.xlabel('max_features')
plt.legend()
Out[71]:
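Rather than scoring each max_features value on the single test split, the search can be run by cross-validation on the training set; a sketch with GridSearchCV (the 5 folds and smaller n_estimators are arbitrary choices to keep it fast):
In [ ]:
# sketch: choose max_features by cross-validation instead of the test set
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(RandomForestRegressor(n_estimators=100, random_state=0),
                    param_grid={'max_features': list(range(1,14))},
                    scoring='neg_mean_squared_error', cv=5)
grid.fit(Xtrain, ytrain)
grid.best_params_, -grid.best_score_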
In [5]:
from sklearn.ensemble import AdaBoostRegressor
In [12]:
# boost regression trees with AdaBoost (5000 estimators, full-depth base trees)
abreg = AdaBoostRegressor(tree.DecisionTreeRegressor(),n_estimators=5000)
abreg.fit(Xtrain,ytrain)
Out[12]:
In [13]:
abreg.feature_importances_
Out[13]:
In [15]:
ypred = abreg.predict(Xtest)
#plot feature importance
feature_importance = abreg.feature_importances_
#make importance relative to max importance
feature_importance = 100.0* (feature_importance/feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
plt.barh(pos,feature_importance[sorted_idx],align ='center')
cols = [collists[i] for i in sorted_idx]
plt.yticks(pos,cols)
plt.xlabel('Relative Importance')
plt.title("Variable Importance")
Out[15]:
In [18]:
mean_squared_error(ytest,ypred)
Out[18]:
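A common companion to AdaBoost here is gradient boosting with shallow trees; a sketch of the scikit-learn version (the learning_rate and max_depth are assumptions, not values from the original notebook):
In [ ]:
# sketch: gradient boosting with shallow trees, for comparison with AdaBoost
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.01, max_depth=4, random_state=0)
gbr.fit(Xtrain, ytrain)
mean_squared_error(ytest, gbr.predict(Xtest))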