In [69]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
training_data = pd.read_csv('../data/train_u6lujuX_CVtuZ9i.csv')
testing_data = pd.read_csv('../data/test_Y3wMUE5_7gLdaTN.csv')
%matplotlib inline
# training_data[training_data.apply(lambda x: x['Credit_History'] == 0 and x['Loan_Status'] == 'Y',axis = 1)]
In [70]:
training_data.head(10)
Out[70]:
In [71]:
training_data.describe()
Out[71]:
In [72]:
training_data.groupby(['Loan_Status']).describe()
Out[72]:
In [73]:
training_data.isnull().sum(axis=0)
Out[73]:
In [74]:
testing_data.isnull().sum()
Out[74]:
In [75]:
training_data['Loan_Status'].value_counts()
Out[75]:
In [76]:
credit_loan = pd.crosstab(training_data['Credit_History'], training_data['Loan_Status'], margins=False)
print(credit_loan)
credit_loan_rate = credit_loan.apply(lambda row: row / row.sum(), axis=1)
print(credit_loan_rate)
credit_loan.plot(kind='bar', color=['red', 'blue'])
Out[76]:
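The same row-normalized table can be computed in one call; a minimal equivalent of the apply above, assuming pandas 0.18.1 or newer for the normalize argument:

credit_loan_rate = pd.crosstab(training_data['Credit_History'],
                               training_data['Loan_Status'],
                               normalize='index')  # each row sums to 1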
In [77]:
# training_data[training_data['Credit_History'].isnull()]
In [78]:
# testing_data[testing_data['Credit_History'].isnull()]
In [79]:
# Dealing with the three missing 'Married' values
training_data['Married'].value_counts()
pd.pivot_table(training_data, values ='LoanAmount', index=['Education'], columns=['Married'], aggfunc=np.median)
Out[79]:
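As an extra check (not part of the original flow), it can help to look at the three incomplete rows directly, since their CoapplicantIncome values motivate the fill heuristic used later in fill_nans:

training_data[training_data['Married'].isnull()][['Gender', 'Dependents', 'CoapplicantIncome', 'LoanAmount']]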
In [80]:
training_data[training_data['Dependents'].isnull()]
Out[80]:
In [81]:
loanAmount_pivot = pd.pivot_table(training_data, values='LoanAmount', index=['Gender', 'Self_Employed'], columns=['Education'], aggfunc=np.median)
loanAmount_pivot['Graduate']['Female']['No']
Out[81]:
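The chained lookup works because the pivot's columns hold Education and its row index is the (Gender, Self_Employed) pair, but .loc with an explicit tuple is less fragile and easier to read:

loanAmount_pivot.loc[('Female', 'No'), 'Graduate']  # same value as the chained lookup above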
In [82]:
# Fill in Gender
# One assumption: if CoapplicantIncome == 0 and the applicant is married, then Gender is
# likely Male (the housewife case). Check the unmarried, self-employed applicants:
training_data[(training_data['CoapplicantIncome'] == 0) & (training_data['Married'] == 'No')
              & (training_data['Self_Employed'] == 'Yes')]['Gender'].value_counts()
Out[82]:
In [83]:
training_data['Loan_Amount_Term'].value_counts()
testing_data.count()
Out[83]:
In [84]:
training_data['Dependents'].value_counts()
Out[84]:
In [85]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
def encode_onehot(df, cols):
    """
    Apply one-hot encoding to the specified columns of a pandas DataFrame.
    Modified from: https://gist.github.com/kljensen/5452382
    Details:
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoded columns
    """
    vec = DictVectorizer()
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names_out()
    vec_data.index = df.index
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df
def fill_nans(X, y=None):
    df = X.copy()
    df.drop('Loan_ID', axis=1, inplace=True)
    # Fill only the missing 'Married' values; the heuristic guesses 'Yes' when
    # there is no coapplicant income.
    df['Married'] = df.apply(lambda row: ('Yes' if row['CoapplicantIncome'] == 0 else 'No')
                             if pd.isnull(row['Married']) else row['Married'], axis=1)
    df['Dependents'].fillna(0.0, inplace=True)
    df['Gender'].fillna('Male', inplace=True)
    df['Self_Employed'].fillna('No', inplace=True)
    # Fill missing LoanAmount with the (Education, Gender, Self_Employed) group median.
    df['LoanAmount'] = df.apply(lambda row: loanAmount_pivot[row['Education']][row['Gender']][row['Self_Employed']]
                                if pd.isnull(row['LoanAmount']) else row['LoanAmount'], axis=1)
    df['Loan_Amount_Term'].fillna(360.0, inplace=True)
    df['Credit_History'].fillna(0.0, inplace=True)
    return df
def data_process(df):
    df['Credit_History'] = df['Credit_History'].fillna(1.0)  # no-op when fill_nans has already run
    df['Education'] = df['Education'].apply(lambda edu: 1 if edu == 'Graduate' else 0)
    df['Gender'] = df['Gender'].apply(lambda gender: 0 if gender == 'Female' else 1)
    df['Married'] = df['Married'].apply(lambda married: 1 if married == 'Yes' else 0)
    df['Self_Employed'] = df['Self_Employed'].apply(lambda self_employed: 1 if self_employed == 'Yes' else 0)
    # df['Loan_Status'] = df['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
    df['MonthPayment'] = df['LoanAmount'] / df['Loan_Amount_Term']
    df['Dependents'] = df['Dependents'].apply(lambda deps: 3 if deps == '3+' else int(deps))
    df = encode_onehot(df, ['Property_Area'])
    return df
fillna_transformer = FunctionTransformer(fill_nans, validate=False)
# training_df = fillna_transformer.transform(training_data)
# training_df.isnull().sum()
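For reference, pandas can produce a near-identical encoding without DictVectorizer; the only difference is the column naming ('Property_Area_Urban' rather than 'Property_Area=Urban'), so the FEATURE_COLUMNS list below would need to match. A sketch:

dummies_df = pd.get_dummies(training_data, columns=['Property_Area'])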
In [86]:
train_df = fill_nans(training_data.drop(['Loan_Status'], axis=1))
test_df = fill_nans(testing_data)
train_df.isnull().sum()
Out[86]:
In [87]:
train_df = data_process(train_df)
test_df = data_process(test_df)
train_df.dtypes
Out[87]:
In [88]:
# plt.boxplot(train_df['MonthPayment'])
train_df['Property_Area=Urban'].value_counts()
Out[88]:
In [89]:
# from sklearn.pipeline import Pipeline
# FEATURE_COLUMNS = ['Credit_History', 'Gender', 'Married', 'Loan_Status']
# def select_columns(X):
# return X[FEATURE_COLUMNS]
# select_transformer = FunctionTransformer(select_columns, validate=False)
# stage = {('fillna', fillna_transformer)}#, ('selectColumns', select_transformer)}
# feature_process = Pipeline(stage)
# training_features = feature_process.transform(training_data)
# training_features
In [90]:
label = training_data['Loan_Status'].apply(lambda loan_status: 1 if loan_status == 'Y' else 0)
FEATURE_COLUMNS = ['Gender', 'Married', 'Dependents',
                   'Education', 'Self_Employed', 'ApplicantIncome',
                   'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                   'Credit_History', 'MonthPayment',
                   'Property_Area=Urban', 'Property_Area=Rural', 'Property_Area=Semiurban']
X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURE_COLUMNS], label, test_size=0.1, random_state=1)
def display_result(y_true_, y_pred_):
    accuracy = accuracy_score(y_true=y_true_, y_pred=y_pred_)
    conf_matrix = confusion_matrix(y_true=y_true_, y_pred=y_pred_)
    print("Classification Accuracy: {}".format(accuracy))
    print("Confusion matrix:\n{}".format(conf_matrix))

def write_to_csv(filename, loan_ids, predict_results, message=""):
    predict_label = pd.Series(predict_results).replace({0: 'N', 1: 'Y'})
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["Loan_ID", "Loan_Status"])
        writer.writerows(zip(loan_ids, predict_label))
    print(message)
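If the Loan_Status counts above show a class imbalance, a stratified split keeps the class ratio identical in both folds. This is a variant of the split above, not what this notebook actually uses:

X_train, X_test, y_train, y_test = train_test_split(
    train_df[FEATURE_COLUMNS], label, test_size=0.1, random_state=1,
    stratify=label)  # preserve the Y/N ratio in both folds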
In [91]:
y_pred = train_df['Credit_History']
display_result(label, y_pred)
y_test_pred = [1] * len(testing_data)  # all-approve baseline; test_df['Credit_History'] is an alternative
write_to_csv('baseline.csv', testing_data['Loan_ID'], y_test_pred, 'baseline')
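The commented-out alternative, submitting Credit_History itself as the prediction, can be written out the same way; the 'baseline_credit.csv' filename below is illustrative, not from the original notebook:

y_test_pred = test_df['Credit_History'].astype(int)
write_to_csv('baseline_credit.csv', testing_data['Loan_ID'], y_test_pred, 'credit-history baseline')  # hypothetical filename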
In [92]:
m1_clf = LogisticRegression(C=1)
m1_clf.fit(X_train, y_train)
y_pred = m1_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_pred)
y_pred = m1_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_pred)
# The linear model is underfitting, since the dataset is not linearly separable
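One caveat before concluding underfitting: ApplicantIncome and LoanAmount are on very different scales, and unscaled features can hurt a regularized logistic regression. A quick check, sketched here rather than run in the original notebook (the scaled_lr name is illustrative):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before the regularized linear model.
scaled_lr = Pipeline([('scale', StandardScaler()),
                      ('lr', LogisticRegression(C=1))])
scaled_lr.fit(X_train, y_train)
display_result(y_true_=y_test, y_pred_=scaled_lr.predict(X_test))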
In [93]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
# Model is overfitting
y_train_pred = dt_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
# Tuning parameters to prevent overfitting
parameters = {'max_depth': np.arange(3, 9, 1),
              'min_samples_leaf': np.arange(3, 30, 2)}
gs_clf = GridSearchCV(dt_clf, parameters, n_jobs=-1, cv=5, refit=True)
gs_clf.fit(X_train, y_train)
gs_clf.cv_results_
best_dt_clf = gs_clf.best_estimator_
print(best_dt_clf)
best_dt_pred = best_dt_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=best_dt_pred)
test_dt_pred = best_dt_clf.predict(test_df[FEATURE_COLUMNS])  # select the same columns, in the same order
write_to_csv('dt_tuned.csv', testing_data['Loan_ID'], test_dt_pred, 'tuned dt model')
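To see which features the tuned tree actually splits on (the Credit_History baseline above suggests it should dominate), the fitted estimator exposes feature_importances_; a small follow-up sketch:

# Rank features by the tree's impurity-based importance.
importances = pd.Series(best_dt_clf.feature_importances_, index=FEATURE_COLUMNS)
print(importances.sort_values(ascending=False))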
In [94]:
m2_clf = RandomForestClassifier(n_estimators=100)
m2_clf.fit(X_train, y_train)
print(m2_clf)
y_train_pred = m2_clf.predict(X_train)
display_result(y_true_=y_train, y_pred_=y_train_pred)
y_test_pred = m2_clf.predict(X_test)
display_result(y_true_=y_test, y_pred_=y_test_pred)
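The held-out set here is only a tenth of the data (roughly 60 rows), so single-split accuracy is noisy; cross-validating over the full training frame gives a steadier estimate. A sketch, not part of the original run:

from sklearn.model_selection import cross_val_score

rf_scores = cross_val_score(m2_clf, train_df[FEATURE_COLUMNS], label, cv=5, scoring='accuracy')
print("CV accuracy: {:.3f} +/- {:.3f}".format(rf_scores.mean(), rf_scores.std()))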
In [95]:
from sklearn.ensemble import GradientBoostingClassifier #GBM algorithm
from sklearn import metrics  # additional scikit-learn functions
from sklearn.model_selection import GridSearchCV  # performing grid search
# Tuning parameters to prevent overfitting
min_samples_split = 30
min_samples_leaf = 15
max_depth = 5
# max_features = 'sqrt'
subsample = 0.6
gbt_clf = GradientBoostingClassifier(min_samples_split=min_samples_split,
                                     min_samples_leaf=min_samples_leaf,
                                     max_depth=max_depth,
                                     # max_features=max_features,
                                     learning_rate=0.1,
                                     subsample=subsample, random_state=10)
parameters = {'n_estimators': np.arange(5, 80, 5)}
gbt_search_1 = GridSearchCV(estimator=gbt_clf, param_grid=parameters, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_1.fit(X_train, y_train)
gbt_search_1.cv_results_
# gbt_search_1.best_estimator_
Out[95]:
In [96]:
n_estimators = 20
learning_rate = 0.1
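A common sanity check once the tree count is fixed is to halve the learning rate while doubling n_estimators, which usually trades a little variance for a little bias. A sketch reusing the hyperparameters set above (the gbt_slow name is illustrative):

gbt_slow = GradientBoostingClassifier(learning_rate=learning_rate / 2,
                                      n_estimators=n_estimators * 2,
                                      min_samples_split=min_samples_split,
                                      min_samples_leaf=min_samples_leaf,
                                      max_depth=max_depth,
                                      subsample=subsample,
                                      random_state=10)
gbt_slow.fit(X_train, y_train)
display_result(y_true_=y_test, y_pred_=gbt_slow.predict(X_test))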
In [97]:
param_test2 = {'max_depth': np.arange(3, 8, 1),
               'min_samples_split': np.arange(10, 40, 5)}
gbt_search_2 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                                 n_estimators=20,
                                                                 max_features='sqrt',
                                                                 subsample=0.8,
                                                                 random_state=10),
                            param_grid=param_test2,
                            scoring='accuracy',
                            n_jobs=4, cv=10)
gbt_search_2.fit(X_train, y_train)
gbt_search_2.cv_results_, gbt_search_2.best_params_, gbt_search_2.best_score_
Out[97]:
In [98]:
max_depth = 5
min_samples_split= 30
In [99]:
param_test3 = {  # 'min_samples_split': np.arange(10, 80, 5),
    'min_samples_leaf': np.arange(5, 30, 5)}
gbt_search_3 = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1,
                                                       n_estimators=20,
                                                       max_features='sqrt',
                                                       subsample=0.8,
                                                       random_state=10,
                                                       max_depth=5,
                                                       min_samples_split=30),
                            param_grid=param_test3, scoring='accuracy', n_jobs=4, cv=10)
gbt_search_3.fit(X_train, y_train)
gbt_search_3.cv_results_, gbt_search_3.best_params_, gbt_search_3.best_score_
Out[99]:
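The notebook stops at the third search; to submit the tuned GBM the same way as the decision tree, the refit best estimator can be reused. The 'gbt_tuned.csv' filename is illustrative:

gbt_final = gbt_search_3.best_estimator_  # refit on X_train with the best parameters
test_gbt_pred = gbt_final.predict(test_df[FEATURE_COLUMNS])
write_to_csv('gbt_tuned.csv', testing_data['Loan_ID'], test_gbt_pred, 'tuned gbt model')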