In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
# Plotting helper adapted from the scikit-learn documentation (learning curves)
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10, 6))  # adjust figure size
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
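# Usage sketch (hedged: any scikit-learn estimator with fit/predict should work here),
# mirroring the scikit-learn example this helper was adapted from; ShuffleSplit was
# imported above for exactly this purpose. SomeClassifier is a placeholder for any
# of the classifiers defined later in this notebook:
#   cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
#   plot_learning_curve(SomeClassifier(), "Learning curve", X, y, cv=cv, n_jobs=1)
#   plt.show()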
In [4]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None, seed_flag=False):
        # Pass random_state only when the wrapped estimator accepts it (seed_flag=True)
        if seed_flag:
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        # Fit once, then print and return the fitted feature importances
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        return importances
In [5]:
# Out-of-Fold Predictions
# (ntrain, ntest, NFOLDS and kf are globals defined in a later cell)
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):  # kf: KFold(n_splits=NFOLDS, ...)
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)   # predictions for the held-out part of x_train
        oof_test_skf[i, :] = clf.predict(x_test)    # row = fold, column = prediction per test sample
    oof_test[:] = np.mean(oof_test_skf, axis=0)     # average the per-fold test predictions
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)  # ensure (n, 1) column shape
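# Shape sketch (stated under the assumption of ntrain training rows and ntest test rows):
#   oof_train -> (ntrain, 1): each training row is predicted by a fold-model that never saw it
#   oof_test  -> (ntest, 1):  the per-fold test predictions averaged over NFOLDS folds
# These column vectors become the input features of the level-2 model further below.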
In [6]:
import urllib.request
import os
In [7]:
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="data/titanic3.xls"
if not os.path.isfile(filepath):
result=urllib.request.urlretrieve(url,filepath)
print('downloaded:',result)
In [8]:
import numpy
import pandas as pd
In [9]:
all_df = pd.read_excel(filepath)
In [10]:
all_df[:2]
Out[10]:
In [11]:
cols=['survived','name','pclass' ,'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df=all_df[cols]
In [12]:
all_df[:2]
Out[12]:
In [13]:
all_df.isnull().sum()
Out[13]:
In [14]:
df=all_df.drop(['name'], axis=1)
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
df['sex']= df['sex'].map({'female':0, 'male': 1}).astype(int)
In [15]:
df[:2]
Out[15]:
In [16]:
x_OneHot_df = pd.get_dummies(data=df,columns=["embarked" ])
x_OneHot_df[:2]
Out[16]:
In [17]:
from sklearn.model_selection import train_test_split
X = x_OneHot_df[['pclass','sex','age','sibsp','parch','fare','embarked_C','embarked_Q','embarked_S']]
y = x_OneHot_df['survived']
test = X[1200:]
X_train, X_test, y_train, y_test = train_test_split(X[:1200], y[:1200], test_size = 0.3, random_state=100)
In [18]:
len(X), len(y), len(all_df), len(test)
Out[18]:
In [19]:
len(X_train), len(X_test), len(y_train), len(y_test)
Out[19]:
In [20]:
# StandardScaler - subtract the mean and divide by std
# MaxAbsScaler - transform down to [-1, 1] bounds
# QuantileTransformer - transform down to [0, 1] bounds
from sklearn.preprocessing import StandardScaler #, MaxAbsScaler, QuantileTransformer
sc = StandardScaler() #MaxAbsScaler()
sc.fit(X)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
test_std = sc.transform(test)
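# Sanity-check sketch (assumption: sc was fit on the full X above, so transforming X
# itself should give columns with mean ~0 and std ~1; the train/test subsets will be close):
#print(sc.transform(X).mean(axis=0).round(2))
#print(sc.transform(X).std(axis=0).round(2))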
In [21]:
from sklearn.model_selection import KFold
NFOLDS = 5  # number of folds for out-of-fold prediction
SEED = 0    # for reproducibility
ntrain = X_train_std.shape[0]  # X.shape[0]
ntest = test_std.shape[0]      # test.shape[0]
kf = KFold(n_splits=NFOLDS, shuffle=False)  # random_state only matters when shuffle=True
In [22]:
# Put in our parameters for said classifiers
# Decision Tree
dt_params = {
    'criterion': 'gini',
    'max_depth': 5
}
# KNN
knn_params = {
    'n_neighbors': 5
}
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'criterion': 'gini',
    'max_depth': 4,
    #'min_samples_leaf': 2,
    'warm_start': True,
    'oob_score': True,
    'verbose': 0
}
# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'verbose': 0
}
# AdaBoost parameters
ada_params = {
    'n_estimators': 800,
    'learning_rate': 0.75
}
# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}
# Support Vector Classifier (linear kernel) parameters
svc_params = {
    'kernel': 'linear',
    'C': 1.0,
    'probability': True
}
# Support Vector Classifier (RBF kernel) parameters
svcr_params = {
    'kernel': 'rbf',
    'C': 1.0,
    'probability': True
}
# Bagging Classifier parameters
bag_params = {
    'n_estimators': 500,
    'oob_score': True
}
# XGBoost Classifier parameters
xgbc_params = {
    'n_estimators': 500,
    'max_depth': 4,
    'learning_rate': 0.05,
    'nthread': -1
}
# Linear Discriminant Analysis parameters
lda_params = {}
# Quadratic Discriminant Analysis parameters
qda1_params = {
    'reg_param': 0.8,
    'tol': 0.00001
}
# Quadratic Discriminant Analysis parameters
qda2_params = {
    'reg_param': 0.6,
    'tol': 0.0001
}
In [23]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED, params=dt_params, seed_flag=True)
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params, seed_flag=True)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params, seed_flag=True)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params, seed_flag=True)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params, seed_flag=True)
#knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=knn_params)
#svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params, seed_flag=True)
#svcr = SklearnHelper(clf=SVC, seed=SEED, params=svcr_params, seed_flag=True)
#bag = SklearnHelper(clf=BaggingClassifier, seed=SEED, params=bag_params, seed_flag=True)
#xgbc = SklearnHelper(clf=XGBClassifier, seed=SEED, params=xgbc_params)
#lda = SklearnHelper(clf=LinearDiscriminantAnalysis, seed=SEED, params=lda_params)
#qda1 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda1_params)
#qda2 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda2_params)
In [24]:
# Create NumPy arrays of the train, test and target (survived) data to feed into our models
y_train = y_train.ravel()
#y.ravel()
#x_train = X.values # Creates an array of the train data
#x_test = test.values # Creats an array of the test data
#STD dataset:
x_train = X_train_std
x_test = test_std
In [25]:
# Create our OOF train and test predictions. These base results will be used as new features
dt_oof_train, dt_oof_test = get_oof(dt, x_train, y_train, x_test) # Decision Tree
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test) # Random Forest
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test) # Gradient Boost
#knn_oof_train, knn_oof_test = get_oof(knn, x_train, y_train, x_test) # KNeighbors
#svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test) # SVM-l
#svcr_oof_train, svcr_oof_test = get_oof(svcr, x_train, y_train, x_test) # SVM-r
#bag_oof_train, bag_oof_test = get_oof(bag, x_train, y_train, x_test) # Bagging
#xgbc_oof_train, xgbc_oof_test = get_oof(xgbc, x_train, y_train, x_test) # XGBoost
#lda_oof_train, lda_oof_test = get_oof(lda, x_train, y_train, x_test) # Linear Discriminant Analysis
#qda1_oof_train, qda1_oof_test = get_oof(qda1, x_train, y_train, x_test) # Quadratic Discriminant Analysis
#qda2_oof_train, qda2_oof_test = get_oof(qda2, x_train, y_train, x_test) # Quadratic Discriminant Analysis
In [26]:
dt_features = dt.feature_importances(x_train,y_train)
rf_features = rf.feature_importances(x_train,y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train,y_train)
In [27]:
cols = X.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame({
    'features': cols,
    'Decision Tree': dt_features,
    'Random Forest': rf_features,
    'Extra Trees': et_features,
    'AdaBoost': ada_features,
    'Gradient Boost': gb_features
})
In [28]:
# Create a new column containing the average importance across the models
feature_dataframe['mean'] = feature_dataframe.mean(axis=1, numeric_only=True)  # row-wise mean over the numeric columns
feature_dataframe
Out[28]:
In [29]:
# First-level output as new features
base_predictions_train = pd.DataFrame({
    'DecisionTree': dt_oof_train.ravel(),
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
    'type': y_train
})
base_predictions_train.head()
Out[29]:
In [30]:
x_train = np.concatenate((dt_oof_train,
                          rf_oof_train,
                          et_oof_train,
                          ada_oof_train,
                          gb_oof_train), axis=1)
x_test = np.concatenate((dt_oof_test,
                         rf_oof_test,
                         et_oof_test,
                         ada_oof_test,
                         gb_oof_test), axis=1)
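# Quick sanity check (sketch): each of the five base models contributes one column,
# so the level-2 matrices should be (ntrain, 5) and (ntest, 5).
#print(x_train.shape, x_test.shape)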
In [31]:
# Second-level learning model
import xgboost as xgb
l2_gbm = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=2000,
    max_depth=4,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1
).fit(x_train, y_train)
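# Inspection sketch: the level-2 model's feature importances indicate how much weight it
# puts on each base learner (columns are DT, RF, ET, AdaBoost, GB, in that order).
#print(l2_gbm.feature_importances_)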
In [32]:
#level-2 CV: x_train, y_train
from sklearn import metrics
print(metrics.classification_report(y_train, l2_gbm.predict(x_train)))
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=None, shuffle=True)
estimator = l2_gbm
#plot_learning_curve(estimator, "level2 - XGBoost", x_train, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 8))
In [33]:
# Level-2 XGBoost: predict on the held-out test features
l2_gbm_pred = l2_gbm.predict(x_test)
print(metrics.precision_recall_fscore_support(y_train, l2_gbm.predict(x_train), average='weighted'))
print(l2_gbm_pred)
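# Hedged check (assumption: y[1200:] holds the true labels for the held-out rows in `test`,
# since `test = X[1200:]` was taken from the same row indices):
#print('held-out accuracy:', metrics.accuracy_score(y[1200:], l2_gbm_pred))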