In [1]:
!pip3 install matplotlib
In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics #Additional scklearn functions
from sklearn.grid_search import GridSearchCV #Perforing grid search
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn import preprocessing
rcParams['figure.figsize'] = 12, 4
In [3]:
# Load the loan dataset and drop the columns not used for modelling
# (one drop call instead of three chained ones — identical result).
unused_cols = [
    'City', 'DOB', 'EMI_Loan_Submitted', 'Employer_Name', 'Interest_Rate',
    'Lead_Creation_Date', 'LoggedIn', 'Salary_Account', 'Var1', 'Filled_Form',
    'Device_Type', 'Var2', 'Mobile_Verified', 'Source', 'Gender',
]
train = (pd.read_csv('Train_nyOWmfK.csv', encoding='latin_1')
         .drop(unused_cols, axis=1)
         .fillna(1))  # NOTE(review): every NaN becomes 1 — confirm intended
target = 'Disbursed'
IDcol = 'ID'
train.head(5)
Out[3]:
In [4]:
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50, target_col='Disbursed'):
    """Fit an xgboost classifier on `dtrain` and report metrics on `dtest`.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. NOTE: mutated in place when useTrainCV=True
        (its n_estimators is replaced by the CV-selected round count).
    dtrain, dtest : pd.DataFrame
        Training frame and evaluation frame (may be the same object).
    predictors : list of str
        Feature column names present in both frames.
    useTrainCV : bool
        If True, run xgb.cv with early stopping to pick n_estimators first.
    cv_folds : int
        Number of CV folds for xgb.cv.
    early_stopping_rounds : int
        Early-stopping patience passed to xgb.cv.
    target_col : str
        Label column name. Defaults to 'Disbursed' (previous behavior relied
        on a module-level `target` global plus hardcoded strings).
    """
    if useTrainCV:
        # Let xgboost's built-in CV with early stopping choose the number
        # of boosting rounds, then pin the estimator to that count.
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values,
                              label=dtrain[target_col].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the training data
    alg.fit(dtrain[predictors], dtrain[target_col], eval_metric='auc')

    # Predict on the evaluation frame (previous comments wrongly said "training set")
    dtest_predictions = alg.predict(dtest[predictors])
    dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]

    # Print model report — these scores are computed on dtest, so the
    # old "(Train)" label was misleading whenever dtest != dtrain.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(
        dtest[target_col].values, dtest_predictions))
    print("AUC Score (Eval): %f" % metrics.roc_auc_score(
        dtest[target_col], dtest_predprob))

    # Bar chart of feature importances from the fitted booster.
    feat_imp = pd.Series(
        alg.get_booster().get_score(importance_type='weight')
    ).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [5]:
# Baseline XGBoost classifier with commonly-used starting hyperparameters.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,        # upper bound; modelfit's CV trims this down
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,                 # replaces deprecated alias `nthread`
    scale_pos_weight=1,
    random_state=27)          # replaces deprecated alias `seed`
In [6]:
# Kyoto training split: drop bookkeeping columns and rename the label
# column so it matches the bank dataset's target name ('Disbursed').
traink = pd.read_csv('kyoto_df_1.csv')
traink = (traink
          .drop(['Unnamed: 0', 'userid', 'Readtime(seconds)'], axis=1)
          .rename(index=str, columns={"class": "Disbursed"}))
# Box plot on a log scale to eyeball feature ranges and outliers.
color = dict(boxes='DarkGreen', whiskers='DarkOrange',
             medians='DarkBlue', caps='Gray')
traink.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
traink.head(5)
Out[6]:
In [7]:
# Kyoto test split, prepared the same way as the training split.
# NOTE(review): unlike traink, 'Readtime(seconds)' is NOT dropped here —
# confirm the second file lacks that column or that keeping it is intended.
testk = pd.read_csv('kyoto_df_2.csv')
testk = (testk
         .drop(['Unnamed: 0', 'userid'], axis=1)
         .rename(index=str, columns={"class": "Disbursed"}))
# Box plot on a log scale to eyeball feature ranges and outliers.
color = dict(boxes='DarkGreen', whiskers='DarkOrange',
             medians='DarkBlue', caps='Gray')
testk.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
testk.sample(5)
Out[7]:
In [8]:
# Feature columns: everything except the last two columns of traink.
# NOTE(review): [:-2] excludes the last TWO columns, not just the
# 'Disbursed' target — confirm the second-to-last column is meant to be
# excluded as well.
predictors = list(traink.columns)[:-2]
predictors
Out[8]:
In [9]:
# Evaluate every fit/eval combination of the two splits with one loop
# instead of four copy-pasted calls. Order matches the original cells.
# NOTE: modelfit (with useTrainCV=True) mutates xgb1's n_estimators on
# each call, so later iterations start from the previously tuned value —
# identical to the original sequential cells.
for fit_df, eval_df in [(traink, traink),
                        (traink, testk),
                        (testk, testk),
                        (testk, traink)]:
    modelfit(xgb1, fit_df, eval_df, predictors, cv_folds=10)