In [69]:
    
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
training_data = pd.read_csv('../data/train_u6lujuX_CVtuZ9i.csv')
testing_data = pd.read_csv('../data/test_Y3wMUE5_7gLdaTN.csv')
%matplotlib inline
# training_data[training_data.apply(lambda x: x['Credit_History'] == 0 and x['Loan_Status'] == 'Y',axis = 1)]
    
In [70]:
    
training_data.head(10)
    
    Out[70]:
In [71]:
    
training_data.describe()
    
    
    Out[71]:
In [72]:
    
training_data.groupby(['Loan_Status']).describe()
    
    
    Out[72]:
In [73]:
    
training_data.isnull().sum(axis=0)
    
    Out[73]:
In [74]:
    
testing_data.isnull().sum()
    
    Out[74]:
In [75]:
    
training_data['Loan_Status'].value_counts()
    
    Out[75]:
In [76]:
    
credit_loan = pd.crosstab(training_data['Credit_History'], training_data['Loan_Status'], margins=False)
print(credit_loan)
credit_loan_rate = credit_loan.apply(lambda row: row/row.sum(), axis=1)
print(credit_loan_rate)
credit_loan.plot(kind='bar', color = ["red", "blue"])
    
    
    Out[76]:
    
In [77]:
    
# training_data[training_data['Credit_History'].isnull()]
    
In [78]:
    
# testing_data[testing_data['Credit_History'].isnull()]
    
In [79]:
    
# dealing with the three missing 'Married' values
training_data['Married'].value_counts()
pd.pivot_table(training_data, values ='LoanAmount', index=['Education'], columns=['Married'], aggfunc=np.median)
    
    Out[79]:
In [80]:
    
training_data[training_data['Dependents'].isnull()]
    
    Out[80]:
In [81]:
    
loanAmount_pivot = pd.pivot_table(training_data, values='LoanAmount', index=['Gender', 'Self_Employed'], columns=['Education'], aggfunc=np.median)
loanAmount_pivot['Graduate']['Female']['No']
    
    Out[81]:
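Chained indexing like the lookup above works, but on a MultiIndexed pivot the explicit .loc form is less fragile; an equivalent sketch:

loanAmount_pivot.loc[('Female', 'No'), 'Graduate']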
In [82]:
    
# Fill in Gender
# One assumption: if CoapplicantIncome == 0 and the applicant is married, Gender is likely Male (the coapplicant may be a housewife)
training_data[(training_data['CoapplicantIncome'] == 0) & (training_data['Married'] == 'No') 
             & (training_data['Self_Employed'] == 'Yes')]['Gender'].value_counts()
    
    Out[82]:
In [83]:
    
training_data['Loan_Amount_Term'].value_counts()
testing_data.count()
    
    Out[83]:
In [84]:
    
training_data['Dependents'].value_counts()
    
    Out[84]:
In [85]:
    
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.
    
    Modified from: https://gist.github.com/kljensen/5452382
    
    Details:
    
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    vec = DictVectorizer()
    
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names_out()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df
def fill_nans(X, y=None):
    df = X.copy()
    
    df.drop('Loan_ID', axis=1, inplace=True)
    # fill only the missing values; assume zero CoapplicantIncome implies a married applicant
    df['Married'] = df.apply(lambda row: ('Yes' if row['CoapplicantIncome'] == 0 else 'No')
                             if pd.isnull(row['Married']) else row['Married'], axis=1)
    df['Dependents'].fillna('0', inplace=True)
    df['Gender'].fillna('Male', inplace=True)
    df['Self_Employed'].fillna('No', inplace=True)
    # impute missing LoanAmount with the pivot's group median (keyed by Gender and Self_Employed, not Married)
    df['LoanAmount'] = df.apply(lambda row: loanAmount_pivot[row['Education']][row['Gender']][row['Self_Employed']]
                                if pd.isnull(row['LoanAmount']) else row['LoanAmount'], axis=1)
    df['Loan_Amount_Term'].fillna(360.0, inplace=True)
    df['Credit_History'].fillna(0.0, inplace=True)
    
    return df
def data_process(df):
    df['Credit_History'] = df['Credit_History'].fillna(1.0)  # no-op after fill_nans, which already fills with 0.0
    df['Education'] = df['Education'].apply(lambda edu: 1 if edu == 'Graduate' else 0)
    df['Gender'] = df['Gender'].apply(lambda gender: 0 if gender == 'Female' else 1)
    df['Married'] = df['Married'].apply(lambda married: 1 if married == 'Yes' else 0)
    df['Self_Employed'] = df['Self_Employed'].apply(lambda self_employed: 1 if self_employed == 'Yes' else 0)
#     df['Loan_Status'] = df['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
    df['MonthPayment'] = df['LoanAmount'] / df['Loan_Amount_Term']
    df['Dependents'] = df['Dependents'].apply(lambda deps: 3 if deps == '3+' else int(deps))
    df = encode_onehot(df, ['Property_Area'])
    return df
fillna_transformer = FunctionTransformer(fill_nans, validate=False)
# training_df = fillna_transformer.transform(training_data)
# training_df.isnull().sum()
    
In [86]:
    
train_df = fill_nans(training_data.drop(['Loan_Status'], axis=1))
test_df = fill_nans(testing_data)
train_df.isnull().sum()
    
    Out[86]:
In [87]:
    
train_df = data_process(train_df)
test_df = data_process(test_df)
train_df.dtypes
    
    Out[87]:
In [88]:
    
# plt.boxplot(train_df['MonthPayment'])
train_df['Property_Area=Urban'].value_counts()
    
    Out[88]:
In [89]:
    
# from sklearn.pipeline import Pipeline
# FEATURE_COLUMNS = ['Credit_History', 'Gender', 'Married', 'Loan_Status']
# def select_columns(X):
#     return X[FEATURE_COLUMNS]
# select_transformer = FunctionTransformer(select_columns, validate=False)
# stage = {('fillna', fillna_transformer)}#, ('selectColumns', select_transformer)}
# feature_process = Pipeline(stage)
# training_features = feature_process.transform(training_data)
# training_features
    
In [90]:
    
label = training_data['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
FEATURE_COLUMNS = ['Gender', 'Married', 'Dependents',
                   'Education', 'Self_Employed', 'ApplicantIncome',
                   'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                   'Credit_History', 'MonthPayment',
                   'Property_Area=Urban','Property_Area=Rural','Property_Area=Semiurban']
X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURE_COLUMNS], label, test_size=0.1, random_state=1)
def display_result(y_true_, y_pred_):
    accuracy = accuracy_score(y_true=y_true_, y_pred=y_pred_)
    conf_matrix = confusion_matrix(y_true=y_true_, y_pred=y_pred_)
    print("Classification Accuracy: {}".format(accuracy))
    print("Confusion matrix:\n{}".format(conf_matrix))
    
def write_to_csv(filename, loan_ids, predict_results, message=""):
    predict_label = pd.Series(predict_results).replace({0: 'N', 1: 'Y'})
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["Loan_ID", "Loan_Status"])
        writer.writerows(zip(loan_ids, predict_label))
    print(message)
    
In [91]:
    
# baseline: predict approval directly from credit history on the training set
y_pred = train_df['Credit_History']
display_result(label, y_pred)
y_test_pred = [1] * len(testing_data)  # all-approve submission; alternatively test_df['Credit_History']
write_to_csv('baseline.csv', testing_data['Loan_ID'], y_test_pred, 'baseline')
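The commented-out expression hints at a stronger baseline than all-approve: predict directly from the filled test-set credit history. A sketch (the filename baseline_credit.csv is arbitrary):

y_test_pred = test_df['Credit_History'].astype(int)
write_to_csv('baseline_credit.csv', testing_data['Loan_ID'], y_test_pred, 'credit-history baseline')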
    
    
In [92]:
    
m1_clf = LogisticRegression(C=1)
m1_clf.fit(X_train, y_train)
y_pred = m1_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_pred)
y_pred = m1_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_pred)
# the linear model is underfitting, since the dataset is not linearly separable
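One way to back the underfitting claim: if cross-validated accuracy sits at roughly the same level as training accuracy, the model lacks capacity rather than data. A minimal sketch:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(C=1), train_df[FEATURE_COLUMNS], label, cv=5, scoring='accuracy')
print("CV accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))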
    
    
In [93]:
    
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
# Model is overfitting
y_train_pred = dt_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
# Tuning parameters to prevent overfitting
parameters = {'max_depth': np.arange(3,9,1),
              'min_samples_leaf':np.arange(3,30,2)}
gs_clf = GridSearchCV(dt_clf, parameters, n_jobs=-1, cv=5, refit=True)
gs_clf.fit(X_train, y_train)
gs_clf.cv_results_['mean_test_score']
best_dt_clf = gs_clf.best_estimator_
print(best_dt_clf)
best_dt_pred = best_dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=best_dt_pred)
test_dt_pred = best_dt_clf.predict(test_df[FEATURE_COLUMNS])  # keep the training column order
write_to_csv('dt_tuned.csv', testing_data['Loan_ID'], test_dt_pred, 'tuned dt model')
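It is also worth checking which features the tuned tree actually uses; a sketch pairing feature_importances_ with FEATURE_COLUMNS (on this dataset Credit_History is likely to dominate):

importances = pd.Series(best_dt_clf.feature_importances_, index=FEATURE_COLUMNS)
print(importances.sort_values(ascending=False))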
    
    
In [94]:
    
m2_clf = RandomForestClassifier(n_estimators=100)
m2_clf.fit(X_train, y_train)
print(m2_clf)
y_train_pred = m2_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = m2_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
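Random forests also provide a built-in generalization estimate from out-of-bag samples, which leaves the held-out split untouched; a sketch refitting with oob_score enabled (the random_state is arbitrary):

rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
rf_oob.fit(X_train, y_train)
print("OOB accuracy: {:.3f}".format(rf_oob.oob_score_))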
    
    
In [95]:
    
from sklearn.ensemble import GradientBoostingClassifier  # GBM algorithm
from sklearn import metrics  # additional sklearn functions
from sklearn.model_selection import GridSearchCV  # performing grid search
# Tuning parameters to prevent overfitting
min_samples_split = 30
min_samples_leaf = 15
max_depth = 5
# max_features = 'sqrt'
subsample = 0.6
gbt_clf = GradientBoostingClassifier(min_samples_split=min_samples_split, 
                                     min_samples_leaf=min_samples_leaf, 
                                     max_depth=max_depth, 
#                                      max_features=max_features, 
                                     learning_rate=0.1,
                                     subsample=subsample, random_state=10)
parameters = {'n_estimators': np.arange(5, 80, 5)}
gbt_search_1 = GridSearchCV(estimator=gbt_clf, param_grid=parameters, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_1.fit(X_train, y_train)
gbt_search_1.cv_results_['mean_test_score']
# gbt_search_1.best_estimator_
    
    Out[95]:
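Search 1 only fixes the tree count before tuning tree shape; the chosen setting can be read off directly, for example:

print(gbt_search_1.best_params_, gbt_search_1.best_score_)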
In [96]:
    
n_estimators = 20
learning_rate = 0.1
    
In [97]:
    
param_test2 = {'max_depth':np.arange(3,8,1), 
               'min_samples_split':np.arange(10,40,5)}
gbt_search_2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
                                                                  n_estimators=20, 
                                                                  max_features='sqrt', 
                                                                  subsample=0.8, 
                                                                  random_state=10),
                           param_grid = param_test2,
                           scoring='accuracy',
                           n_jobs=4, cv=10)
gbt_search_2.fit(X_train, y_train)
gbt_search_2.cv_results_['mean_test_score'], gbt_search_2.best_params_, gbt_search_2.best_score_
    
    Out[97]:
In [98]:
    
max_depth = 5
min_samples_split= 30
    
In [99]:
    
param_test3 = {#'min_samples_split':np.arange(10,80,5), 
               'min_samples_leaf':np.arange(5,30,5)}
gbt_search_3 = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1,
                                                        n_estimators=20, 
                                                        max_features='sqrt', 
                                                        subsample=0.8, 
                                                        random_state=10,
                                                        max_depth = 5,
                                                        min_samples_split= 30),
                             param_grid=param_test3, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_3.fit(X_train, y_train)
gbt_search_3.cv_results_['mean_test_score'], gbt_search_3.best_params_, gbt_search_3.best_score_
    
    Out[99]:
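The searches end with tuned hyperparameters but no final model or submission. A closing sketch under the settings found above (min_samples_leaf taken from search 3's best_params_; the filename gbt_tuned.csv is arbitrary):

best_leaf = gbt_search_3.best_params_['min_samples_leaf']
final_gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=20, max_features='sqrt',
                                       subsample=0.8, max_depth=5, min_samples_split=30,
                                       min_samples_leaf=best_leaf, random_state=10)
final_gbt.fit(X_train, y_train)
display_result(y_true_=y_test, y_pred_=final_gbt.predict(X_test))
test_gbt_pred = final_gbt.predict(test_df[FEATURE_COLUMNS])
write_to_csv('gbt_tuned.csv', testing_data['Loan_ID'], test_gbt_pred, 'tuned gbt model')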