In [69]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

training_data = pd.read_csv('../data/train_u6lujuX_CVtuZ9i.csv')
testing_data = pd.read_csv('../data/test_Y3wMUE5_7gLdaTN.csv')
%matplotlib inline
# training_data[training_data.apply(lambda x: x['Credit_History'] == 0 and x['Loan_Status'] == 'Y',axis = 1)]

In [70]:
training_data.head(10)


Out[70]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y
6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y
7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N
8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y
9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N

In [71]:
training_data.describe()


/Users/zzhang/.virtualenvs/ds_py3_venv/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[71]:
ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
count 614.000000 614.000000 592.000000 600.00000 564.000000
mean 5403.459283 1621.245798 146.412162 342.00000 0.842199
std 6109.041673 2926.248369 85.587325 65.12041 0.364878
min 150.000000 0.000000 9.000000 12.00000 0.000000
25% 2877.500000 0.000000 NaN NaN NaN
50% 3812.500000 1188.500000 NaN NaN NaN
75% 5795.000000 2297.250000 NaN NaN NaN
max 81000.000000 41667.000000 700.000000 480.00000 1.000000

In [72]:
training_data.groupby(['Loan_Status']).describe()


/Users/zzhang/.virtualenvs/ds_py3_venv/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[72]:
ApplicantIncome CoapplicantIncome Credit_History LoanAmount Loan_Amount_Term
Loan_Status
N count 192.000000 192.000000 179.000000 181.000000 186.000000
mean 5446.078125 1877.807292 0.541899 151.220994 344.064516
std 6819.558528 4384.060103 0.499639 85.862783 69.238921
min 150.000000 0.000000 0.000000 9.000000 36.000000
25% 2885.000000 0.000000 NaN NaN NaN
50% 3833.500000 268.000000 NaN NaN NaN
75% 5861.250000 2273.750000 NaN NaN NaN
max 81000.000000 41667.000000 1.000000 570.000000 480.000000
Y count 422.000000 422.000000 385.000000 411.000000 414.000000
mean 5384.068720 1504.516398 0.981818 144.294404 341.072464
std 5765.441615 1924.754855 0.133782 85.484607 63.247770
min 210.000000 0.000000 0.000000 17.000000 12.000000
25% 2877.500000 0.000000 NaN NaN NaN
50% 3812.500000 1239.500000 NaN NaN NaN
75% 5771.500000 2297.250000 NaN NaN NaN
max 63337.000000 20000.000000 1.000000 700.000000 480.000000

In [73]:
training_data.isnull().sum(axis=0)


Out[73]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [74]:
testing_data.isnull().sum()


Out[74]:
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [75]:
training_data['Loan_Status'].value_counts()


Out[75]:
Y    422
N    192
Name: Loan_Status, dtype: int64

In [76]:
credit_loan = pd.crosstab(training_data['Credit_History'], training_data['Loan_Status'], margins=False)
print(credit_loan)
credit_loan_rate = credit_loan.apply(lambda row: row/row.sum(), axis=1)
print(credit_loan_rate)
credit_loan.plot(kind='bar', color = ["red", "blue"])


Loan_Status      N    Y
Credit_History         
0.0             82    7
1.0             97  378
Loan_Status            N         Y
Credit_History                    
0.0             0.921348  0.078652
1.0             0.204211  0.795789
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x112c10d30>

In [77]:
# training_data[training_data['Credit_History'].isnull()]

In [78]:
# testing_data[testing_data['Credit_History'].isnull()]

In [79]:
# dealing with three missing 'Married' value
training_data['Married'].value_counts()
pd.pivot_table(training_data, values ='LoanAmount', index=['Education'], columns=['Married'], aggfunc=np.median)


Out[79]:
Married No Yes
Education
Graduate 118.0 137.5
Not Graduate 110.0 121.5

In [80]:
training_data[training_data['Dependents'].isnull()]


Out[80]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
102 LP001350 Male Yes NaN Graduate No 13650 0.0 NaN 360.0 1.0 Urban Y
104 LP001357 Male NaN NaN Graduate No 3816 754.0 160.0 360.0 1.0 Urban Y
120 LP001426 Male Yes NaN Graduate No 5667 2667.0 180.0 360.0 1.0 Rural Y
226 LP001754 Male Yes NaN Not Graduate Yes 4735 0.0 138.0 360.0 1.0 Urban N
228 LP001760 Male NaN NaN Graduate No 4758 0.0 158.0 480.0 1.0 Semiurban Y
293 LP001945 Female No NaN Graduate No 5417 0.0 143.0 480.0 0.0 Urban N
301 LP001972 Male Yes NaN Not Graduate No 2875 1750.0 105.0 360.0 1.0 Semiurban Y
332 LP002100 Male No NaN Graduate No 2833 0.0 71.0 360.0 1.0 Urban Y
335 LP002106 Male Yes NaN Graduate Yes 5503 4490.0 70.0 NaN 1.0 Semiurban Y
346 LP002130 Male Yes NaN Not Graduate No 3523 3230.0 152.0 360.0 0.0 Rural N
355 LP002144 Female No NaN Graduate No 3813 0.0 116.0 180.0 1.0 Urban Y
435 LP002393 Female NaN NaN Graduate No 10047 0.0 NaN 240.0 1.0 Semiurban Y
517 LP002682 Male Yes NaN Not Graduate No 3074 1800.0 123.0 360.0 0.0 Semiurban N
571 LP002847 Male Yes NaN Graduate No 5116 1451.0 165.0 360.0 0.0 Urban N
597 LP002943 Male No NaN Graduate No 2987 0.0 88.0 360.0 0.0 Semiurban N

In [81]:
loanAmount_pivot = pd.pivot_table(training_data, values='LoanAmount', index=['Gender', 'Self_Employed'], columns=['Education'], aggfunc=np.median)
loanAmount_pivot['Graduate']['Female']['No']


Out[81]:
112.5

In [82]:
# Fill in Gender
# one assumption: if CoapplicantIncome == 0 and married , then Gender == male. case housewife
training_data[(training_data['CoapplicantIncome'] == 0) & (training_data['Married'] == 'No') 
             & (training_data['Self_Employed'] == 'Yes')]['Gender'].value_counts()


Out[82]:
Male      12
Female    10
Name: Gender, dtype: int64

In [83]:
training_data['Loan_Amount_Term'].value_counts()
testing_data.count()


Out[83]:
Loan_ID              367
Gender               356
Married              367
Dependents           357
Education            367
Self_Employed        344
ApplicantIncome      367
CoapplicantIncome    367
LoanAmount           362
Loan_Amount_Term     361
Credit_History       338
Property_Area        367
dtype: int64

In [84]:
training_data['Dependents'].value_counts()


Out[84]:
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [85]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer

def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.
    
    Modified from: https://gist.github.com/kljensen/5452382
    
    Details:
    
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    vec = DictVectorizer()
    
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

def fill_nans(X, y=None):
    df = X.copy()
    
    df.drop('Loan_ID', axis=1, inplace=True)
    df['Married'] = df.apply(lambda row: 'Yes' if row['CoapplicantIncome'] == 0 else 'No', axis =1) 
    df['Dependents'].fillna(0.0, inplace=True)
    df['Gender'].fillna('Male', inplace=True)
    df['Self_Employed'].fillna('No', inplace=True)
    df['LoanAmount'] = df.apply(lambda row: loanAmount_pivot[row['Education']][row['Gender']][row['Married']] if row['LoanAmount'] else row['LoanAmount'],axis=1)
    df['Loan_Amount_Term'].fillna(360.0, inplace=True)
    df['Credit_History'].fillna(0.0, inplace=True)
    
    return df


def data_process(df):
    df['Credit_History'] = df['Credit_History'].fillna(1.0)
    df['Education'] = df['Education'].apply(lambda edu: 1 if edu == 'Graduate' else 0)
    df['Gender'] = df['Gender'].apply(lambda gender: 0 if gender == 'Female' else 1)
    df['Married'] = df['Married'].apply(lambda gender: 1 if gender == 'Yes' else 1) 
    df['Self_Employed'] = df['Self_Employed'].apply(lambda self_employed: 1 if self_employed == 'Yes' else 0)
#     df['Loan_Status'] = df['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
    df['MonthPayment'] = df['LoanAmount'] / df['Loan_Amount_Term']
    df['Dependents'] = df['Dependents'].apply(lambda deps: 3 if deps == '3+' else int(deps))
    df = encode_onehot(df, ['Property_Area'])
    return df

fillna_transformer = FunctionTransformer(fill_nans, validate=False)
# training_df = fillna_transformer.transform(training_data)
# training_df.isnull().sum()

In [86]:
train_df = fill_nans(training_data.drop(['Loan_Status'], axis =1))
test_df = fill_nans(testing_data)
train_df.isnull().sum()


Out[86]:
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [87]:
train_df = data_process(train_df)
test_df=data_process(test_df)
train_df.dtypes


Out[87]:
Gender                       int64
Married                      int64
Dependents                   int64
Education                    int64
Self_Employed                int64
ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
MonthPayment               float64
Property_Area=Rural        float64
Property_Area=Semiurban    float64
Property_Area=Urban        float64
dtype: object

In [88]:
# plt.boxplot(train_df['MonthPayment'])
train_df['Property_Area=Urban'].value_counts()


Out[88]:
0.0    412
1.0    202
Name: Property_Area=Urban, dtype: int64

In [89]:
# from sklearn.pipeline import Pipeline

# FEATURE_COLUMNS = ['Credit_History', 'Gender', 'Married', 'Loan_Status']

# def select_columns(X):
#     return X[FEATURE_COLUMNS]

# select_transformer = FunctionTransformer(select_columns, validate=False)

# stage = {('fillna', fillna_transformer)}#, ('selectColumns', select_transformer)}


# feature_process = Pipeline(stage)
# training_features = feature_process.transform(training_data)
# training_features

In [90]:
label = training_data['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)

FEATURE_COLUMNS = ['Gender', 'Married', 'Dependents',
                   'Education', 'Self_Employed', 'ApplicantIncome',
                   'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                   'Credit_History', 'MonthPayment',
                   'Property_Area=Urban','Property_Area=Rural','Property_Area=Semiurban']

X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURE_COLUMNS], label, test_size=0.1, random_state=1)

def display_result(y_true_, y_pred_):
    accuracy = accuracy_score(y_true=y_true_, y_pred=y_pred_)
    conf_matrix = confusion_matrix(y_true=y_true_, y_pred=y_pred_)
    print("Classification Accuracy: {}".format(accuracy))
    print("Confusion matrix:\n{}".format(conf_matrix))
    
def write_to_csv(filename, loan_ids, predict_results, message=""):
    predict_label= pd.Series(predict_results).replace({0: 'N', 1: 'Y'})
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["Loan_ID", "Loan_Status"])
        writer.writerows(zip(loan_ids, predict_label))
    print(message)

Non-trained model


In [91]:
y_pred = train_df['Credit_History']
display_result(label, y_pred)

y_test_pred = [1]*367 #test_df['Credit_History']
write_to_csv('baseline.csv', testing_data['Loan_ID'], y_test_pred, 'baseline')


Classification Accuracy: 0.7703583061889251
Confusion matrix:
[[ 95  97]
 [ 44 378]]
baseline

LR Model


In [92]:
m1_clf = LogisticRegression(C=1)

m1_clf.fit(X_train, y_train)

y_pred = m1_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_pred)

y_pred = m1_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_pred)

# linear model is underfitting, since the dataset is not linearly separatable


Classification Accuracy: 0.7735507246376812
Confusion matrix:
[[ 85  87]
 [ 38 342]]
Classification Accuracy: 0.7580645161290323
Confusion matrix:
[[ 7 13]
 [ 2 40]]

Decision Tree


In [93]:
dt_clf = DecisionTreeClassifier()

dt_clf.fit(X_train, y_train)

# Model is overfitting
y_train_pred = dt_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)

y_test_pred = dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)

#Tunning parameter to prevent overfitting
parameters = {'max_depth': np.arange(3,9,1),
              'min_samples_leaf':np.arange(3,30,2)}
gs_clf = GridSearchCV(dt_clf, parameters, n_jobs=-1, cv=5, refit=True)
gs_clf.fit(X_train, y_train)
gs_clf.grid_scores_

best_dt_clf = gs_clf.best_estimator_
print(best_dt_clf)
best_dt_pred = best_dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=best_dt_pred)

test_dt_pred = best_dt_clf.predict(test_df)
write_to_csv('dt_tuned.csv', testing_data['Loan_ID'], test_dt_pred, 'tuned dt model')


Classification Accuracy: 1.0
Confusion matrix:
[[172   0]
 [  0 380]]
Classification Accuracy: 0.6935483870967742
Confusion matrix:
[[ 9 11]
 [ 8 34]]
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Classification Accuracy: 0.7580645161290323
Confusion matrix:
[[ 8 12]
 [ 3 39]]
tuned dt model

In [94]:
m2_clf = RandomForestClassifier(n_estimators=100)

m2_clf.fit(X_train, y_train)

print(m2_clf)

y_train_pred = m2_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)

y_test_pred = m2_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Classification Accuracy: 1.0
Confusion matrix:
[[172   0]
 [  0 380]]
Classification Accuracy: 0.6935483870967742
Confusion matrix:
[[ 7 13]
 [ 6 36]]

Gradient Boosting Tree


In [95]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

#Tunning parameter to prevent overfitting
min_samples_split = 30
min_samples_leaf = 15
max_depth = 5
# max_features = 'sqrt'
subsample = 0.6

gbt_clf = GradientBoostingClassifier(min_samples_split=min_samples_split, 
                                     min_samples_leaf=min_samples_leaf, 
                                     max_depth=max_depth, 
#                                      max_features=max_features, 
                                     learning_rate=0.1,
                                     subsample=subsample, random_state=10)

parameters={'n_estimators':np.arange(5,80,5)}
gbt_search_1 = GridSearchCV(estimator=gbt_clf, param_grid=parameters, scoring='accuracy',n_jobs=4,iid=False, cv=10)
gbt_search_1.fit(X_train, y_train)
gbt_search_1.grid_scores_
# gbt_search_1.best_estimator_


Out[95]:
[mean: 0.68838, std: 0.02151, params: {'n_estimators': 5},
 mean: 0.73364, std: 0.05421, params: {'n_estimators': 10},
 mean: 0.75893, std: 0.06495, params: {'n_estimators': 15},
 mean: 0.76065, std: 0.07040, params: {'n_estimators': 20},
 mean: 0.76607, std: 0.07255, params: {'n_estimators': 25},
 mean: 0.76062, std: 0.07587, params: {'n_estimators': 30},
 mean: 0.75162, std: 0.07001, params: {'n_estimators': 35},
 mean: 0.75698, std: 0.07476, params: {'n_estimators': 40},
 mean: 0.75880, std: 0.07769, params: {'n_estimators': 45},
 mean: 0.74610, std: 0.07515, params: {'n_estimators': 50},
 mean: 0.74250, std: 0.07145, params: {'n_estimators': 55},
 mean: 0.74071, std: 0.06673, params: {'n_estimators': 60},
 mean: 0.74792, std: 0.07533, params: {'n_estimators': 65},
 mean: 0.74078, std: 0.06456, params: {'n_estimators': 70},
 mean: 0.73539, std: 0.06994, params: {'n_estimators': 75}]

In [96]:
n_estimators = 20
learning_rate = 0.1

In [97]:
param_test2 = {'max_depth':np.arange(3,8,1), 
               'min_samples_split':np.arange(10,40,5)}
gbt_search_2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
                                                                  n_estimators=20, 
                                                                  max_features='sqrt', 
                                                                  subsample=0.8, 
                                                                  random_state=10),
                           param_grid = param_test2,
                           scoring='accuracy',
                           n_jobs=4,iid=False, cv=10)
gbt_search_2.fit(X_train, y_train)
gbt_search_2.grid_scores_, gbt_search_2.best_params_, gbt_search_2.best_score_


Out[97]:
([mean: 0.72643, std: 0.05599, params: {'max_depth': 3, 'min_samples_split': 10},
  mean: 0.72101, std: 0.06252, params: {'max_depth': 3, 'min_samples_split': 15},
  mean: 0.72455, std: 0.06578, params: {'max_depth': 3, 'min_samples_split': 20},
  mean: 0.72818, std: 0.06482, params: {'max_depth': 3, 'min_samples_split': 25},
  mean: 0.72818, std: 0.06168, params: {'max_depth': 3, 'min_samples_split': 30},
  mean: 0.72276, std: 0.05779, params: {'max_depth': 3, 'min_samples_split': 35},
  mean: 0.74451, std: 0.06937, params: {'max_depth': 4, 'min_samples_split': 10},
  mean: 0.72620, std: 0.07538, params: {'max_depth': 4, 'min_samples_split': 15},
  mean: 0.72987, std: 0.08348, params: {'max_depth': 4, 'min_samples_split': 20},
  mean: 0.73721, std: 0.07446, params: {'max_depth': 4, 'min_samples_split': 25},
  mean: 0.72994, std: 0.06415, params: {'max_depth': 4, 'min_samples_split': 30},
  mean: 0.72643, std: 0.05839, params: {'max_depth': 4, 'min_samples_split': 35},
  mean: 0.74455, std: 0.07252, params: {'max_depth': 5, 'min_samples_split': 10},
  mean: 0.74081, std: 0.07110, params: {'max_depth': 5, 'min_samples_split': 15},
  mean: 0.74263, std: 0.08071, params: {'max_depth': 5, 'min_samples_split': 20},
  mean: 0.74812, std: 0.07423, params: {'max_depth': 5, 'min_samples_split': 25},
  mean: 0.73552, std: 0.06606, params: {'max_depth': 5, 'min_samples_split': 30},
  mean: 0.74623, std: 0.05619, params: {'max_depth': 5, 'min_samples_split': 35},
  mean: 0.74273, std: 0.07436, params: {'max_depth': 6, 'min_samples_split': 10},
  mean: 0.74802, std: 0.07248, params: {'max_depth': 6, 'min_samples_split': 15},
  mean: 0.72630, std: 0.06577, params: {'max_depth': 6, 'min_samples_split': 20},
  mean: 0.74990, std: 0.08292, params: {'max_depth': 6, 'min_samples_split': 25},
  mean: 0.74812, std: 0.06916, params: {'max_depth': 6, 'min_samples_split': 30},
  mean: 0.76799, std: 0.07645, params: {'max_depth': 6, 'min_samples_split': 35},
  mean: 0.73172, std: 0.06988, params: {'max_depth': 7, 'min_samples_split': 10},
  mean: 0.75166, std: 0.07262, params: {'max_depth': 7, 'min_samples_split': 15},
  mean: 0.72994, std: 0.08614, params: {'max_depth': 7, 'min_samples_split': 20},
  mean: 0.73714, std: 0.08082, params: {'max_depth': 7, 'min_samples_split': 25},
  mean: 0.74078, std: 0.06983, params: {'max_depth': 7, 'min_samples_split': 30},
  mean: 0.75166, std: 0.06984, params: {'max_depth': 7, 'min_samples_split': 35}],
 {'max_depth': 6, 'min_samples_split': 35},
 0.76798701298701288)

In [98]:
max_depth = 5
min_samples_split= 30

In [99]:
param_test3 = {#'min_samples_split':np.arange(10,80,5), 
               'min_samples_leaf':np.arange(5,30,5)}
gbt_search_3 = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1,
                                                        n_estimators=20, 
                                                        max_features='sqrt', 
                                                        subsample=0.8, 
                                                        random_state=10,
                                                        max_depth = 5,
                                                        min_samples_split= 30),
                             param_grid = param_test3, scoring='accuracy',n_jobs=4,iid=False, cv=10)
gbt_search_3.fit(X_train, y_train)
gbt_search_3.grid_scores_, gbt_search_3.best_params_, gbt_search_3.best_score_


Out[99]:
([mean: 0.73912, std: 0.06715, params: {'min_samples_leaf': 5},
  mean: 0.74451, std: 0.06439, params: {'min_samples_leaf': 10},
  mean: 0.74094, std: 0.06947, params: {'min_samples_leaf': 15},
  mean: 0.72289, std: 0.06210, params: {'min_samples_leaf': 20},
  mean: 0.72279, std: 0.06766, params: {'min_samples_leaf': 25}],
 {'min_samples_leaf': 10},
 0.74451298701298707)