In [1]:
!pip3 install matplotlib


Requirement already satisfied (use --upgrade to upgrade): matplotlib in /usr/local/lib/python3.5/dist-packages
Requirement already satisfied (use --upgrade to upgrade): pytz in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): kiwisolver>=1.0.1 in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): six>=1.10 in /usr/lib/python3/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.1 in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2.1 in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): cycler>=0.10 in /usr/local/lib/python3.5/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): setuptools in /usr/lib/python3/dist-packages (from kiwisolver>=1.0.1->matplotlib)
You are using pip version 8.1.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn import preprocessing

rcParams['figure.figsize'] = 12, 4


/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [3]:
train = pd.read_csv('Train_nyOWmfK.csv', encoding='latin_1')
train = train.drop(['City', 'DOB', 'EMI_Loan_Submitted', 'Employer_Name', 'Interest_Rate'], axis=1)
train = train.drop(['Lead_Creation_Date', 'LoggedIn', 'Salary_Account', 'Var1', 'Filled_Form'], axis=1)
train = train.drop(['Device_Type', 'Var2', 'Mobile_Verified', 'Source', 'Gender'], axis=1)
train = train.fillna(1)
target = 'Disbursed'
IDcol = 'ID'
train.head(5)


Out[3]:
ID Monthly_Income Loan_Amount_Applied Loan_Tenure_Applied Existing_EMI Var5 Loan_Amount_Submitted Loan_Tenure_Submitted Processing_Fee Var4 Disbursed
0 ID000002C20 20000 300000.0 5.0 0.0 0 1.0 1.0 1.0 1 0
1 ID000004E40 35000 200000.0 2.0 0.0 13 200000.0 2.0 1.0 3 0
2 ID000007H20 22500 600000.0 4.0 0.0 0 450000.0 4.0 1.0 1 0
3 ID000008I30 35000 1000000.0 5.0 0.0 10 920000.0 5.0 1.0 3 0
4 ID000009J40 100000 500000.0 2.0 25000.0 17 500000.0 2.0 1.0 3 0

In [4]:
def modelfit(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtest[predictors])
    dtrain_predprob = alg.predict_proba(dtest[predictors])[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtest['Disbursed'].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtest['Disbursed'], dtrain_predprob))
     
    feat_imp = pd.Series(alg.get_booster().get_score(importance_type='weight')).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [5]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [6]:
traink = pd.read_csv('kyoto_df_1.csv')
traink = traink.drop(['Unnamed: 0', 'userid', 'Readtime(seconds)'], axis=1)
traink.rename(index=str, columns={"class": "Disbursed"}, inplace=True)

#x = traink.values #returns a numpy array
#min_max_scaler = preprocessing.MinMaxScaler()
#x_scaled = min_max_scaler.fit_transform(x)
#traink = pd.DataFrame(x_scaled)

color = dict(boxes='DarkGreen', whiskers='DarkOrange', medians='DarkBlue', caps='Gray')
traink.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
traink.head(5)


/usr/local/lib/python3.5/dist-packages/matplotlib/ticker.py:2198: UserWarning: Data has no positive values, and therefore cannot be log-scaled.
  "Data has no positive values, and therefore cannot be "
Out[6]:
bookmarkc closec markerc memoc mobilec openc pcc tabletc watchc SEARCH ... Page_JumpC Add_BookmarkC Delete_BookmarkC Add_MemoC Delete_MemoC Change_MemoC Add_MarkerC Delete_MarkerC Readpages Disbursed
0 0 3 0 0 0 27 1240 66 1306 0 ... 65 0 0 0 0 0 0 0 1211 False
1 0 0 0 0 0 1 8 0 8 0 ... 0 0 0 0 0 0 0 0 7 False
2 2 1 2 0 45 5 409 0 454 0 ... 1 1 1 0 0 0 2 0 443 False
3 2 0 2 2 0 4 89 0 89 0 ... 2 1 1 1 0 0 1 1 77 False
4 0 3 0 0 0 5 540 0 540 0 ... 2 0 0 0 0 0 0 0 530 True

5 rows × 26 columns


In [7]:
testk = pd.read_csv('kyoto_df_2.csv')
testk = testk.drop(['Unnamed: 0', 'userid'], axis=1)
testk.rename(index=str, columns={"class": "Disbursed"}, inplace=True)

#x = testk.values #returns a numpy array
#min_max_scaler = preprocessing.MinMaxScaler()
#x_scaled = min_max_scaler.fit_transform(x)
#testk = pd.DataFrame(x_scaled)

color = dict(boxes='DarkGreen', whiskers='DarkOrange', medians='DarkBlue', caps='Gray')
testk.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
testk.sample(5)


Out[7]:
bookmarkc closec markerc memoc mobilec openc pcc tabletc watchc SEARCH ... Page_JumpC Add_BookmarkC Delete_BookmarkC Add_MemoC Delete_MemoC Change_MemoC Add_MarkerC Delete_MarkerC Readpages Disbursed
54 2 5 0 0 0 6 314 0 314 0 ... 0 1 1 0 0 0 0 0 301 True
10 0 12 0 4 0 12 439 0 439 0 ... 2 0 0 2 0 2 0 0 409 True
44 0 2 0 0 0 6 835 0 835 0 ... 38 0 0 0 0 0 0 0 789 False
36 0 1 0 0 0 1 9 0 9 0 ... 0 0 0 0 0 0 0 0 7 False
53 0 2 0 0 0 5 384 0 384 0 ... 12 0 0 0 0 0 0 0 365 True

5 rows × 27 columns


In [8]:
predictors = list(traink)[0:-2]
predictors


Out[8]:
['bookmarkc',
 'closec',
 'markerc',
 'memoc',
 'mobilec',
 'openc',
 'pcc',
 'tabletc',
 'watchc',
 'SEARCH',
 'PREV',
 'NEXT',
 'BacktrackRate(PrevC/NextC)',
 'LINK_CLICK',
 'SEARCH_JUMP',
 'JUMPC',
 'Page_JumpC',
 'Add_BookmarkC',
 'Delete_BookmarkC',
 'Add_MemoC',
 'Delete_MemoC',
 'Change_MemoC',
 'Add_MarkerC',
 'Delete_MarkerC']

Evaluation


In [9]:
modelfit(xgb1, traink, traink, predictors, cv_folds=10)


/usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
Model Report
Accuracy : 0.8113
AUC Score (Train): 0.886752

In [10]:
modelfit(xgb1, traink, testk, predictors, cv_folds=10)


/usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
Model Report
Accuracy : 0.6
AUC Score (Train): 0.601223

In [11]:
modelfit(xgb1, testk, testk, predictors, cv_folds=10)


Model Report
Accuracy : 0.8909
AUC Score (Train): 0.932745
/usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

In [12]:
modelfit(xgb1, testk, traink, predictors, cv_folds=10)


Model Report
Accuracy : 0.434
AUC Score (Train): 0.481481
/usr/local/lib/python3.5/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff: