In [69]:
    
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
training_data = pd.read_csv('../data/train_u6lujuX_CVtuZ9i.csv')
testing_data = pd.read_csv('../data/test_Y3wMUE5_7gLdaTN.csv')
%matplotlib inline
# training_data[training_data.apply(lambda x: x['Credit_History'] == 0 and x['Loan_Status'] == 'Y',axis = 1)]
    
In [70]:
    
training_data.head(10)
    
    Out[70]:
In [71]:
    
training_data.describe()
    
    
    Out[71]:
In [72]:
    
training_data.groupby(['Loan_Status']).describe()
    
    
    Out[72]:
In [73]:
    
training_data.isnull().sum(axis=0)
    
    Out[73]:
In [74]:
    
testing_data.isnull().sum()
    
    Out[74]:
In [75]:
    
training_data['Loan_Status'].value_counts()
    
    Out[75]:
In [76]:
    
credit_loan = pd.crosstab(training_data['Credit_History'], training_data['Loan_Status'], margins=False)
print(credit_loan)
credit_loan_rate = credit_loan.apply(lambda row: row/row.sum(), axis=1)
print(credit_loan_rate)
credit_loan.plot(kind='bar', color = ["red", "blue"])
    
    
    Out[76]:
    
In [77]:
    
# training_data[training_data['Credit_History'].isnull()]
    
In [78]:
    
# testing_data[testing_data['Credit_History'].isnull()]
    
In [79]:
    
# dealing with the three missing 'Married' values
training_data['Married'].value_counts()
pd.pivot_table(training_data, values ='LoanAmount', index=['Education'], columns=['Married'], aggfunc=np.median)
    
    Out[79]:
In [80]:
    
training_data[training_data['Dependents'].isnull()]
    
    Out[80]:
In [81]:
    
loanAmount_pivot = pd.pivot_table(training_data, values='LoanAmount', index=['Gender', 'Self_Employed'], columns=['Education'], aggfunc=np.median)
loanAmount_pivot['Graduate']['Female']['No']
    
    Out[81]:
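Chained indexing like the lookup above works, but on a MultiIndexed pivot the explicit .loc form is less fragile; an equivalent sketch:

loanAmount_pivot.loc[('Female', 'No'), 'Graduate']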
In [82]:
    
# Fill in Gender
# One assumption: if CoapplicantIncome == 0 and the applicant is married, Gender is likely Male (the coapplicant may be a housewife)
training_data[(training_data['CoapplicantIncome'] == 0) & (training_data['Married'] == 'No') 
             & (training_data['Self_Employed'] == 'Yes')]['Gender'].value_counts()
    
    Out[82]:
In [83]:
    
training_data['Loan_Amount_Term'].value_counts()
testing_data.count()
    
    Out[83]:
In [84]:
    
training_data['Dependents'].value_counts()
    
    Out[84]:
In [85]:
    
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.
    
    Modified from: https://gist.github.com/kljensen/5452382
    
    Details:
    
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    vec = DictVectorizer()
    
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names_out()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df
def fill_nans(X, y=None):
    df = X.copy()
    
    df.drop('Loan_ID', axis=1, inplace=True)
    # fill only the missing values; assume zero CoapplicantIncome implies a married applicant
    df['Married'] = df.apply(lambda row: ('Yes' if row['CoapplicantIncome'] == 0 else 'No')
                             if pd.isnull(row['Married']) else row['Married'], axis=1)
    df['Dependents'].fillna('0', inplace=True)
    df['Gender'].fillna('Male', inplace=True)
    df['Self_Employed'].fillna('No', inplace=True)
    # impute missing LoanAmount with the pivot's group median (keyed by Gender and Self_Employed, not Married)
    df['LoanAmount'] = df.apply(lambda row: loanAmount_pivot[row['Education']][row['Gender']][row['Self_Employed']]
                                if pd.isnull(row['LoanAmount']) else row['LoanAmount'], axis=1)
    df['Loan_Amount_Term'].fillna(360.0, inplace=True)
    df['Credit_History'].fillna(0.0, inplace=True)
    
    return df
def data_process(df):
    df['Credit_History'] = df['Credit_History'].fillna(1.0)  # no-op after fill_nans, which already fills with 0.0
    df['Education'] = df['Education'].apply(lambda edu: 1 if edu == 'Graduate' else 0)
    df['Gender'] = df['Gender'].apply(lambda gender: 0 if gender == 'Female' else 1)
    df['Married'] = df['Married'].apply(lambda married: 1 if married == 'Yes' else 0)
    df['Self_Employed'] = df['Self_Employed'].apply(lambda self_employed: 1 if self_employed == 'Yes' else 0)
#     df['Loan_Status'] = df['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
    df['MonthPayment'] = df['LoanAmount'] / df['Loan_Amount_Term']
    df['Dependents'] = df['Dependents'].apply(lambda deps: 3 if deps == '3+' else int(deps))
    df = encode_onehot(df, ['Property_Area'])
    return df
fillna_transformer = FunctionTransformer(fill_nans, validate=False)
# training_df = fillna_transformer.transform(training_data)
# training_df.isnull().sum()
    
In [86]:
    
train_df = fill_nans(training_data.drop(['Loan_Status'], axis=1))
test_df = fill_nans(testing_data)
train_df.isnull().sum()
    
    Out[86]:
In [87]:
    
train_df = data_process(train_df)
test_df = data_process(test_df)
train_df.dtypes
    
    Out[87]:
In [88]:
    
# plt.boxplot(train_df['MonthPayment'])
train_df['Property_Area=Urban'].value_counts()
    
    Out[88]:
In [89]:
    
# from sklearn.pipeline import Pipeline
# FEATURE_COLUMNS = ['Credit_History', 'Gender', 'Married', 'Loan_Status']
# def select_columns(X):
#     return X[FEATURE_COLUMNS]
# select_transformer = FunctionTransformer(select_columns, validate=False)
# stage = {('fillna', fillna_transformer)}#, ('selectColumns', select_transformer)}
# feature_process = Pipeline(stage)
# training_features = feature_process.transform(training_data)
# training_features
    
In [90]:
    
label = training_data['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
FEATURE_COLUMNS = ['Gender', 'Married', 'Dependents',
                   'Education', 'Self_Employed', 'ApplicantIncome',
                   'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                   'Credit_History', 'MonthPayment',
                   'Property_Area=Urban','Property_Area=Rural','Property_Area=Semiurban']
X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURE_COLUMNS], label, test_size=0.1, random_state=1)
def display_result(y_true_, y_pred_):
    accuracy = accuracy_score(y_true=y_true_, y_pred=y_pred_)
    conf_matrix = confusion_matrix(y_true=y_true_, y_pred=y_pred_)
    print("Classification Accuracy: {}".format(accuracy))
    print("Confusion matrix:\n{}".format(conf_matrix))
    
def write_to_csv(filename, loan_ids, predict_results, message=""):
    predict_label = pd.Series(predict_results).replace({0: 'N', 1: 'Y'})
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["Loan_ID", "Loan_Status"])
        writer.writerows(zip(loan_ids, predict_label))
    print(message)
    
In [91]:
    
# baseline: predict approval directly from credit history on the training set
y_pred = train_df['Credit_History']
display_result(label, y_pred)
y_test_pred = [1] * len(testing_data)  # all-approve submission; alternatively test_df['Credit_History']
write_to_csv('baseline.csv', testing_data['Loan_ID'], y_test_pred, 'baseline')
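The commented-out expression hints at a stronger baseline than all-approve: predict directly from the filled test-set credit history. A sketch (the filename baseline_credit.csv is arbitrary):

y_test_pred = test_df['Credit_History'].astype(int)
write_to_csv('baseline_credit.csv', testing_data['Loan_ID'], y_test_pred, 'credit-history baseline')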
    
    
In [92]:
    
m1_clf = LogisticRegression(C=1)
m1_clf.fit(X_train, y_train)
y_pred = m1_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_pred)
y_pred = m1_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_pred)
# the linear model is underfitting, since the dataset is not linearly separable
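One way to back the underfitting claim: if cross-validated accuracy sits at roughly the same level as training accuracy, the model lacks capacity rather than data. A minimal sketch:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(C=1), train_df[FEATURE_COLUMNS], label, cv=5, scoring='accuracy')
print("CV accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))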
    
    
In [93]:
    
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
# Model is overfitting
y_train_pred = dt_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
# Tuning parameters to prevent overfitting
parameters = {'max_depth': np.arange(3,9,1),
              'min_samples_leaf':np.arange(3,30,2)}
gs_clf = GridSearchCV(dt_clf, parameters, n_jobs=-1, cv=5, refit=True)
gs_clf.fit(X_train, y_train)
gs_clf.cv_results_['mean_test_score']
best_dt_clf = gs_clf.best_estimator_
print(best_dt_clf)
best_dt_pred = best_dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=best_dt_pred)
test_dt_pred = best_dt_clf.predict(test_df[FEATURE_COLUMNS])  # keep the training column order
write_to_csv('dt_tuned.csv', testing_data['Loan_ID'], test_dt_pred, 'tuned dt model')
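It is also worth checking which features the tuned tree actually uses; a sketch pairing feature_importances_ with FEATURE_COLUMNS (on this dataset Credit_History is likely to dominate):

importances = pd.Series(best_dt_clf.feature_importances_, index=FEATURE_COLUMNS)
print(importances.sort_values(ascending=False))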
    
    
In [94]:
    
m2_clf = RandomForestClassifier(n_estimators=100)
m2_clf.fit(X_train, y_train)
print(m2_clf)
y_train_pred = m2_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = m2_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
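Random forests also provide a built-in generalization estimate from out-of-bag samples, which leaves the held-out split untouched; a sketch refitting with oob_score enabled (the random_state is arbitrary):

rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
rf_oob.fit(X_train, y_train)
print("OOB accuracy: {:.3f}".format(rf_oob.oob_score_))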
    
    
In [95]:
    
from sklearn.ensemble import GradientBoostingClassifier  # GBM algorithm
from sklearn import metrics  # additional sklearn functions
from sklearn.model_selection import GridSearchCV  # performing grid search
# Tuning parameters to prevent overfitting
min_samples_split = 30
min_samples_leaf = 15
max_depth = 5
# max_features = 'sqrt'
subsample = 0.6
gbt_clf = GradientBoostingClassifier(min_samples_split=min_samples_split, 
                                     min_samples_leaf=min_samples_leaf, 
                                     max_depth=max_depth, 
#                                      max_features=max_features, 
                                     learning_rate=0.1,
                                     subsample=subsample, random_state=10)
parameters = {'n_estimators': np.arange(5, 80, 5)}
gbt_search_1 = GridSearchCV(estimator=gbt_clf, param_grid=parameters, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_1.fit(X_train, y_train)
gbt_search_1.cv_results_['mean_test_score']
# gbt_search_1.best_estimator_
    
    Out[95]:
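Search 1 only fixes the tree count before tuning tree shape; the chosen setting can be read off directly, for example:

print(gbt_search_1.best_params_, gbt_search_1.best_score_)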
In [96]:
    
n_estimators = 20
learning_rate = 0.1
    
In [97]:
    
param_test2 = {'max_depth':np.arange(3,8,1), 
               'min_samples_split':np.arange(10,40,5)}
gbt_search_2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
                                                                  n_estimators=20, 
                                                                  max_features='sqrt', 
                                                                  subsample=0.8, 
                                                                  random_state=10),
                           param_grid = param_test2,
                           scoring='accuracy',
                           n_jobs=4, cv=10)
gbt_search_2.fit(X_train, y_train)
gbt_search_2.cv_results_['mean_test_score'], gbt_search_2.best_params_, gbt_search_2.best_score_
    
    Out[97]:
In [98]:
    
max_depth = 5
min_samples_split= 30
    
In [99]:
    
param_test3 = {#'min_samples_split':np.arange(10,80,5), 
               'min_samples_leaf':np.arange(5,30,5)}
gbt_search_3 = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1,
                                                        n_estimators=20, 
                                                        max_features='sqrt', 
                                                        subsample=0.8, 
                                                        random_state=10,
                                                        max_depth = 5,
                                                        min_samples_split= 30),
                             param_grid=param_test3, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_3.fit(X_train, y_train)
gbt_search_3.cv_results_['mean_test_score'], gbt_search_3.best_params_, gbt_search_3.best_score_
    
    Out[99]:
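The searches end with tuned hyperparameters but no final model or submission. A closing sketch under the settings found above (min_samples_leaf taken from search 3's best_params_; the filename gbt_tuned.csv is arbitrary):

best_leaf = gbt_search_3.best_params_['min_samples_leaf']
final_gbt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=20, max_features='sqrt',
                                       subsample=0.8, max_depth=5, min_samples_split=30,
                                       min_samples_leaf=best_leaf, random_state=10)
final_gbt.fit(X_train, y_train)
display_result(y_true_=y_test, y_pred_=final_gbt.predict(X_test))
test_gbt_pred = final_gbt.predict(test_df[FEATURE_COLUMNS])
write_to_csv('gbt_tuned.csv', testing_data['Loan_ID'], test_gbt_pred, 'tuned gbt model')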