In [350]:
# Imports
import pandas as pd
import numpy as np
# machine learning
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
# xgboost
import xgboost as xgb
# matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [351]:
# Custom helper function
# Computes cross-validated precision, recall, and accuracy
def performance(clf, X_train, Y_train, cv_num=4):
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='precision')
    print("precision is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='recall')
    print("recall is {}".format(scores.mean()))
    scores = cross_val_score(clf, X_train, Y_train, cv=cv_num, scoring='accuracy')
    print("accuracy is {}".format(scores.mean()))
In [352]:
# get titanic & test csv files as a DataFrame
train = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/train.csv")
test = pd.read_csv("/Users/wy/notebook/kaggle_competitions/titanic/test.csv")
test_passengerId = test['PassengerId']
In [353]:
train.info()
print "--------------"
test.info()
In [354]:
#Combine into data:
train['source']= 'train'
test['source'] = 'test'
data=pd.concat([train, test],ignore_index=True)
data.shape
Out[354]:
In [355]:
# Take a quick look at what the data looks like
data.head()
Out[355]:
In [356]:
# Count missing values per column
data.apply(lambda x: sum(x.isnull()))
Out[356]:
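The same per-column null counts can be obtained more idiomatically with:
# data.isnull().sum()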
In [357]:
var = ['Sex','Ticket','Cabin','Embarked']
for v in var:
    print('\nFrequency count for variable %s' % v)
    print(data[v].value_counts())
In [358]:
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111)
ax = data.boxplot(column='Fare', by=['Embarked', 'Pclass'], ax=ax)
# Both passengers with a missing Embarked paid a fare of 80 in Pclass 1;
# mark that level to see which port/class combination matches best
plt.axhline(y=80, color='green')
ax.set_title('', y=1.1)
data[data.Embarked.isnull()][['Fare', 'Pclass', 'Embarked']]
Out[358]:
In [359]:
# Fare 80 sits closest to the Pclass 1 / Embarked 'C' median, so fill the two missing values with 'C'
data['Embarked'].fillna('C', inplace=True)
In [360]:
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(111)
# The one passenger with a missing Fare is in Pclass 3 and embarked at 'S'
data[(data.Pclass==3) & (data.Embarked=='S')].Fare.hist(bins=100, ax=ax)
plt.xlabel('Fare')
plt.ylabel('Frequency')
plt.title('Histogram of Fare, Pclass 3 and Embarked S')
data[data.Fare.isnull()][['Pclass', 'Fare', 'Embarked']]
Out[360]:
In [361]:
print ("The top 5 most common value of Fare")
data[(data.Pclass==3)&(data.Embarked=='S')].Fare.value_counts().head()
Out[361]:
In [362]:
# 8.05 is the most common fare among Pclass 3 / Embarked 'S' passengers
data['Fare'].fillna(8.05, inplace=True)
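The hard-coded 8.05 is the modal fare in that subgroup; a sketch that derives the fill value instead of hard-coding it (same Pclass 3 / Embarked 'S' assumption):
# Alternative to the hard-coded constant: derive the modal fare directly
# fill_fare = data[(data.Pclass == 3) & (data.Embarked == 'S')]['Fare'].mode()[0]
# data['Fare'].fillna(fill_fare, inplace=True)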
In [363]:
# Flag rows with a missing Cabin, then fill them with the placeholder 'U0' (unknown deck, room 0)
data['Cabin_Missing'] = data['Cabin'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Cabin'].fillna('U0', inplace=True)
In [364]:
import re
# Number of words in the passenger's name
data['Names'] = data['Name'].map(lambda x: len(re.split(' ', x)))
In [365]:
# Extract the title (the token between ', ' and '.') and merge the rare ones
title = data['Name'].map(lambda x: re.compile(r', (.*?)\.').findall(x)[0])
title[title == 'Mme'] = 'Mrs'
title[title.isin(['Ms', 'Mlle'])] = 'Miss'
title[title.isin(['Don', 'Jonkheer'])] = 'Sir'
title[title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'
title[title.isin(['Capt', 'Col', 'Major', 'Dr', 'Officer', 'Rev'])] = 'Officer'
data['Title'] = title
del title
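A quick sanity check that the rare titles were merged as intended:
# data['Title'].value_counts()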
In [366]:
# The leading letter(s) of the Cabin string identify the deck; filled-in cabins map to 'U'
deck = data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
data['Deck'] = deck
del deck
In [367]:
data.head()
Out[367]:
In [368]:
# Parse the room number out of the Cabin string; cabins without digits default to 1
checker = re.compile("([0-9]+)")
def roomNum(x):
    nums = checker.search(x)
    if nums:
        return int(nums.group()) + 1
    else:
        return 1
rooms = data['Cabin'].map(lambda x: roomNum(x))
In [369]:
# Normalize the room numbers by their overall sum
data['Cabin_Room'] = rooms / rooms.sum()
del checker, roomNum
In [370]:
data.head()
Out[370]:
In [371]:
# Group size = parents/children + siblings/spouses + the passenger themself
data['Group_num'] = data['Parch'] + data['SibSp'] + 1
In [372]:
# Bucket group size: 1 -> 'S', 2-4 -> 'M', 5+ -> 'L'
def groupSize(x):
    if x > 4:
        return 'L'
    elif x == 1:
        return 'S'
    else:
        return 'M'
group_size = data['Group_num'].map(lambda x: groupSize(x))
data['Group_size'] = group_size
In [373]:
data.head()
Out[373]:
In [374]:
data.dtypes
Out[374]:
In [375]:
from sklearn.preprocessing import StandardScaler
# Standardize Fare to zero mean and unit variance
scaler = StandardScaler()
data['Nor_Fare'] = pd.Series(scaler.fit_transform(data['Fare'].values.reshape(-1,1)).reshape(-1), index=data.index)
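Note that fitting the scaler on the combined frame uses test-set statistics, which is common in competitions but would leak in a production setting; a leak-free variant would fit on training rows only:
# Leak-free sketch: fit on training rows, transform everything
# scaler.fit(data.loc[data['source'] == 'train', 'Fare'].values.reshape(-1, 1))
# data['Nor_Fare'] = scaler.transform(data['Fare'].values.reshape(-1, 1)).reshape(-1)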
In [376]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Embarked', 'Sex', 'Deck', 'Group_size', 'Title']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
In [377]:
# One-hot encode the categorical columns
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Out[377]:
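Since pd.get_dummies one-hot encodes these columns anyway, the LabelEncoder pass above only renames the categories to integers (so the dummy columns come out as e.g. Sex_0 / Sex_1). An equivalent shortcut, sketched under the same column list, is to one-hot the raw string categories directly:
# data = pd.get_dummies(data, columns=['Embarked', 'Sex', 'Deck', 'Group_size', 'Title'])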
In [378]:
# Save the training labels before Survived is dropped from the combined frame below
label_y = data[data['source'] == 'train']['Survived']
In [379]:
from sklearn.model_selection import train_test_split
# Drop columns that won't be used as model features
data.drop(labels=['PassengerId', 'Name', 'Cabin', 'Survived', 'Ticket', 'Fare'], axis=1, inplace=True)
# Use the rows with a known Age to train a regressor that imputes the missing ages
X = data[data['Age'].notnull()].drop(['Age', 'source'], axis=1)
y = data[data['Age'].notnull()].Age
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def get_model(estimator, parameters, X_train, y_train, scoring):
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_

XGB = xgb.XGBRegressor(max_depth=4, seed=42)
scoring = make_scorer(mean_absolute_error, greater_is_better=False)
parameters = {'reg_alpha': np.linspace(0.1, 1.0, 5), 'reg_lambda': np.linspace(1.0, 3.0, 5)}
reg_xgb = get_model(XGB, parameters, X_train, y_train, scoring)
print(reg_xgb)
print("Mean absolute error of test data: {}".format(mean_absolute_error(y_test, reg_xgb.predict(X_test))))
In [380]:
fig = plt.figure(figsize=(15, 6))
alpha = 0.5
# Age distribution before imputation (known ages only)
data['Age'].dropna().plot(kind='density', color='#FA2379', label='Before', alpha=alpha)
# Predict the missing ages and fill them in
pred = reg_xgb.predict(data[data['Age'].isnull()].drop(['Age', 'source'], axis=1))
data.loc[data['Age'].isnull(), 'Age'] = pred
# Age distribution after imputation
data['Age'].plot(kind='density', label='After', alpha=alpha)
plt.xlabel('Age')
plt.title("What's the distribution of Age after predicting?")
plt.legend(loc='best')
plt.grid()
In [381]:
# Split the combined frame back into train and test using the 'source' flag
train = data.loc[data['source'] == 'train'].drop('source', axis=1)
test = data.loc[data['source'] == 'test'].drop('source', axis=1)
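A quick shape check; the standard Kaggle Titanic split has 891 training rows and 418 test rows:
# print(train.shape, test.shape, label_y.shape)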
In [382]:
def modelfit(alg, train, label_y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the number of boosting rounds
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train, label=label_y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(train, label_y, eval_metric='auc')
    # Predict the training set
    dtrain_predictions = alg.predict(train)
    dtrain_predprob = alg.predict_proba(train)[:, 1]
    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(label_y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(label_y, dtrain_predprob))
    # Plot feature importances from the fitted booster
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [383]:
from xgboost.sklearn import XGBClassifier
from sklearn import metrics

# Baseline classifier; modelfit will shrink n_estimators to the CV-chosen round count
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    # scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, label_y)
In [384]:
# Grid search over max_depth and min_child_weight (coarse)
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(train, label_y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Out[384]:
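Aside: grid_scores_ comes from older scikit-learn releases; from 0.20 on it is removed and the same information lives in cv_results_, e.g.:
# pd.DataFrame(gsearch1.cv_results_)[['params', 'mean_test_score', 'std_test_score']]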
In [385]:
# Refine max_depth and min_child_weight around the best values from the coarse search
param_test2 = {
    'max_depth': [8, 9, 10, 11, 12],
    'min_child_weight': [4, 5, 6]
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, seed=27),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(train, label_y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
Out[385]:
In [386]:
#Grid seach on subsample and max_features
#Choose all predictors except target & IDcols
param_test3 = {
'gamma':[i/10.0 for i in range(0,15)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=10,
min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4,seed=27),
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train,label_y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
Out[386]:
In [387]:
# Re-fit with the tuned max_depth, min_child_weight, and gamma
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    seed=27)
modelfit(xgb2, train, label_y)
In [388]:
# Grid search over subsample and colsample_bytree
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch4.fit(train, label_y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
Out[388]:
In [389]:
# Coarse grid search over the L1 regularization term reg_alpha
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch5.fit(train, label_y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
Out[389]:
In [390]:
# Fine grid search over reg_alpha around 0.1
param_test6 = {
    'reg_alpha': [0, 0.01, 0.05, 0.1, 0.16, 0.19]
}
gsearch6 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=177, max_depth=4,
                                                min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch6.fit(train, label_y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_
Out[390]:
In [391]:
# Add the tuned reg_alpha and the resampled row/column fractions
xgb3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    objective='binary:logistic',
    nthread=4,
    seed=27)
modelfit(xgb3, train, label_y)
In [392]:
# Final model: cut the learning rate tenfold and allow proportionally more trees
xgb4 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=10,
    min_child_weight=5,
    gamma=0.9,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb4, train, label_y)
In [393]:
# Predict survival for the Kaggle test set
test_predict = xgb4.predict(test)
In [394]:
submission = pd.DataFrame({
    "PassengerId": test_passengerId,
    "Survived": test_predict
})
In [402]:
# Kaggle expects integer 0/1 labels in the Survived column
submission['Survived'] = submission['Survived'].astype('int64')
In [404]:
submission.to_csv('/Users/wy/Desktop/titanic_xgboost2.csv', index=False)