In [1]:
!pip3 install matplotlib
In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics #Additional scklearn functions
from sklearn.grid_search import GridSearchCV #Perforing grid search
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn import preprocessing
rcParams['figure.figsize'] = 12, 4
In [3]:
# Load the loan dataset and drop the columns not used for modelling
# (one drop call instead of three chained ones — identical result).
unused_cols = [
    'City', 'DOB', 'EMI_Loan_Submitted', 'Employer_Name', 'Interest_Rate',
    'Lead_Creation_Date', 'LoggedIn', 'Salary_Account', 'Var1', 'Filled_Form',
    'Device_Type', 'Var2', 'Mobile_Verified', 'Source', 'Gender',
]
train = (pd.read_csv('Train_nyOWmfK.csv', encoding='latin_1')
         .drop(unused_cols, axis=1)
         .fillna(1))  # NOTE(review): every NaN becomes 1 — confirm intended
target = 'Disbursed'
IDcol = 'ID'
train.head(5)
Out[3]:
In [4]:
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50, target_col='Disbursed'):
    """Fit an xgboost classifier on `dtrain` and report metrics on `dtest`.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. NOTE: mutated in place when useTrainCV=True
        (its n_estimators is replaced by the CV-selected round count).
    dtrain, dtest : pd.DataFrame
        Training frame and evaluation frame (may be the same object).
    predictors : list of str
        Feature column names present in both frames.
    useTrainCV : bool
        If True, run xgb.cv with early stopping to pick n_estimators first.
    cv_folds : int
        Number of CV folds for xgb.cv.
    early_stopping_rounds : int
        Early-stopping patience passed to xgb.cv.
    target_col : str
        Label column name. Defaults to 'Disbursed' (previous behavior relied
        on a module-level `target` global plus hardcoded strings).
    """
    if useTrainCV:
        # Let xgboost's built-in CV with early stopping choose the number
        # of boosting rounds, then pin the estimator to that count.
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values,
                              label=dtrain[target_col].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the training data
    alg.fit(dtrain[predictors], dtrain[target_col], eval_metric='auc')

    # Predict on the evaluation frame (previous comments wrongly said "training set")
    dtest_predictions = alg.predict(dtest[predictors])
    dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]

    # Print model report — these scores are computed on dtest, so the
    # old "(Train)" label was misleading whenever dtest != dtrain.
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(
        dtest[target_col].values, dtest_predictions))
    print("AUC Score (Eval): %f" % metrics.roc_auc_score(
        dtest[target_col], dtest_predprob))

    # Bar chart of feature importances from the fitted booster.
    feat_imp = pd.Series(
        alg.get_booster().get_score(importance_type='weight')
    ).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
In [5]:
# Baseline XGBoost classifier with commonly-used starting hyperparameters.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,        # upper bound; modelfit's CV trims this down
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,                 # replaces deprecated alias `nthread`
    scale_pos_weight=1,
    random_state=27)          # replaces deprecated alias `seed`
In [6]:
# Kyoto training split: drop bookkeeping columns and rename the label
# column so it matches the bank dataset's target name ('Disbursed').
traink = pd.read_csv('kyoto_df_1.csv')
traink = (traink
          .drop(['Unnamed: 0', 'userid', 'Readtime(seconds)'], axis=1)
          .rename(index=str, columns={"class": "Disbursed"}))
# Box plot on a log scale to eyeball feature ranges and outliers.
color = dict(boxes='DarkGreen', whiskers='DarkOrange',
             medians='DarkBlue', caps='Gray')
traink.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
traink.head(5)
Out[6]:
In [7]:
# Kyoto test split, prepared the same way as the training split.
# NOTE(review): unlike traink, 'Readtime(seconds)' is NOT dropped here —
# confirm the second file lacks that column or that keeping it is intended.
testk = pd.read_csv('kyoto_df_2.csv')
testk = (testk
         .drop(['Unnamed: 0', 'userid'], axis=1)
         .rename(index=str, columns={"class": "Disbursed"}))
# Box plot on a log scale to eyeball feature ranges and outliers.
color = dict(boxes='DarkGreen', whiskers='DarkOrange',
             medians='DarkBlue', caps='Gray')
testk.plot.box(color=color, sym='r+', figsize=(20, 10), logy=True)
testk.sample(5)
Out[7]:
In [8]:
# Feature columns: everything except the last two columns of traink.
# NOTE(review): [:-2] excludes the last TWO columns, not just the
# 'Disbursed' target — confirm the second-to-last column is meant to be
# excluded as well.
predictors = list(traink.columns)[:-2]
predictors
Out[8]:
In [9]:
# Evaluate every fit/eval combination of the two splits with one loop
# instead of four copy-pasted calls. Order matches the original cells.
# NOTE: modelfit (with useTrainCV=True) mutates xgb1's n_estimators on
# each call, so later iterations start from the previously tuned value —
# identical to the original sequential cells.
for fit_df, eval_df in [(traink, traink),
                        (traink, testk),
                        (testk, testk),
                        (testk, traink)]:
    modelfit(xgb1, fit_df, eval_df, predictors, cv_folds=10)