In [420]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_ind
sns.set_style('white')
For investors it is interesting to know which characteristics of a loan predict whether it will end up charged off. LendingClub runs its own algorithms beforehand to estimate which loans are riskier and assigns each loan a grade (A-G). As we saw in the exploration of the dataset, this grade correlates well with the probability of charge-off. The interest rate should reflect the risk (higher interest for more risk) to keep the riskier loans attractive to invest in. Although grade and interest rate correlate well, the correlation is not perfect.
We will use the loans that went to full term to build classifiers that classify loans as charged off or fully paid. The accuracy measure used is sklearn's 'f1_weighted', which can be interpreted as a weighted average of precision and recall. Confusion matrices and ROC curves will also be used for analysis. Grade serves as the baseline prediction of charged off/fully paid; we then look for features that add predictive value on top of grade and see whether this gives us any insight.
In [341]:
loans = pd.read_csv('../data/loan.csv')
closed_loans = loans[loans['loan_status'].isin(['Fully Paid', 'Charged Off'])]
print(closed_loans.shape)
round(sum(closed_loans['loan_status']=='Charged Off')/len(closed_loans['loan_status'])*100)
Out[341]:
We selected the features that can be included in the prediction. We left out features that are not known at the start of a loan, like 'total payment', since those cannot help new investors. Non-predictive features like 'id', and features that take the same value for every loan, are excluded as well, as are all features to do with 'joint' loans, since we keep only individual loans. We did keep 'loan_status' as the target for the prediction. Furthermore, features that were missing in more than 10% of the loans were excluded, leaving 24 features (excluding the target 'loan_status').
In [342]:
include = ['term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
'annual_inc', 'purpose', 'zip_code', 'addr_state', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
'mths_since_last_major_derog', 'acc_now_delinq', 'loan_amnt', 'open_il_6m', 'open_il_12m',
'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'dti', 'open_acc_6m', 'tot_cur_bal',
'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
'inq_last_12m', 'issue_d', 'loan_status']
exclude = ['funded_amnt', 'funded_amnt_inv', 'verification_status', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
'initial_list_status', 'id', 'member_id', 'emp_title', 'pymnt_plan', 'url', 'desc', 'title',
'out_prncp', 'out_prncp_inv', 'total_pymnt', 'last_pymnt_amnt', 'next_pymnt_d', 'policy_code',
'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'tot_coll_amt',
]
# exclude the one joint application
closed_loans = closed_loans[closed_loans['application_type'] == 'INDIVIDUAL']
# make id index
closed_loans.index = closed_loans.id
# include only the features above
closed_loans = closed_loans[include]
# exclude features with more than 10% missing values
columns_not_missing = closed_loans.isnull().mean() < 0.1
closed_loans = closed_loans.loc[:, columns_not_missing[columns_not_missing].index]
# delete rows with NANs
print(1 - closed_loans.dropna().shape[0] / closed_loans.shape[0]) # ratio deleted rows
closed_loans = closed_loans.dropna()
# calculate nr of days between earliest creditline and issue date of the loan
# delete the two original features
closed_loans['earliest_cr_line'] = pd.to_datetime(closed_loans['earliest_cr_line'])
closed_loans['issue_d'] = pd.to_datetime(closed_loans['issue_d'])
closed_loans['days_since_first_credit_line'] = closed_loans['issue_d'] - closed_loans['earliest_cr_line']
closed_loans['days_since_first_credit_line'] = closed_loans['days_since_first_credit_line'] / np.timedelta64(1, 'D')
closed_loans = closed_loans.drop(['earliest_cr_line', 'issue_d'], axis=1)
# delete redundant features
#closed_loans = closed_loans.drop(['grade'], axis=1)
# round annual_inc up to thousands and cap outliers at 200,000
closed_loans['annual_inc'] = np.ceil(closed_loans['annual_inc'] / 1000)
closed_loans.loc[closed_loans['annual_inc'] > 200, 'annual_inc'] = 200
closed_loans.shape
Out[342]:
In [343]:
closed_loans.head()
Out[343]:
In [229]:
closed_loans.columns
Out[229]:
In [230]:
plt.hist(closed_loans['annual_inc'], bins=100)
Out[230]:
We keep 30% of the data separate for now, so that we can later use it to reliably test the performance of the classifier. The split is stratified by 'loan_status' in order to divide older loans equally over the split (older loans have a higher charge-off probability). The labels to predict are in 'loan_status' (Charged Off vs. Fully Paid).
In [237]:
X_train, X_test, y_train, y_test = train_test_split(closed_loans, closed_loans['loan_status'],
                                                    test_size=0.3, random_state=123,
                                                    stratify=closed_loans['loan_status'])
X_train = X_train.drop('loan_status', axis=1)
X_test = X_test.drop('loan_status', axis=1)
We start with the logistic regression classifier. This is a simple classifier that fits a sigmoid curve to the features to predict the probability that a sample belongs to each class. It has one parameter to tune, the C parameter: the inverse of the regularization strength, where smaller values mean stronger regularization. In sklearn the input features have to be numerical, so we need to convert the categorical features to numeric. Ordered categorical features get adjacent numbers, and unordered features are given the most sensible order we can come up with during conversion, for instance geographical for the states. There can also be no NaN/inf/-inf values, so these are set to 0. For this algorithm we also have to scale and normalize the features.
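As a quick illustration of what is being fitted, here is a minimal sketch (with made-up coefficients w, b and feature vector x, purely for illustration, not the model trained below) of the sigmoid that logistic regression uses and of how C is passed to sklearn:
import numpy as np
from sklearn.linear_model import LogisticRegression

def sigmoid(z):
    # squashes any real number into a probability between 0 and 1
    return 1.0 / (1.0 + np.exp(-z))

# logistic regression predicts P(class) = sigmoid(w . x + b)
w, b = np.array([0.8, -1.2]), 0.5          # hypothetical fitted coefficients
x = np.array([1.0, 2.0])                   # one (scaled) feature vector
print(sigmoid(np.dot(w, x) + b))           # probability of the positive class

# smaller C = stronger l1 (lasso) regularization, shrinking more coefficients to exactly 0
clf_strong_reg = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
clf_weak_reg = LogisticRegression(penalty='l1', solver='liblinear', C=100)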
Non-numeric features were converted as follows:
In [239]:
# features that are not float or int, so not to be converted:
# ordered:
# sub_grade, emp_length, zip_code, term
# unordered:
# home_ownership, purpose, addr_state (ordered geographically)
# term
X_train['term'] = X_train['term'].apply(lambda x: int(x.split(' ')[1]))
# grade
grade_dict = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
X_train['grade'] = X_train['grade'].apply(lambda x: grade_dict[x])
# emp_length
emp_length_dict = {'n/a':0,
'< 1 year':0,
'1 year':1,
'2 years':2,
'3 years':3,
'4 years':4,
'5 years':5,
'6 years':6,
'7 years':7,
'8 years':8,
'9 years':9,
'10+ years':10}
X_train['emp_length'] = X_train['emp_length'].apply(lambda x: emp_length_dict[x])
# zipcode
X_train['zip_code'] = X_train['zip_code'].apply(lambda x: int(x[0:3]))
# subgrade
X_train['sub_grade'] = X_train['grade'] + X_train['sub_grade'].apply(lambda x: float(list(x)[1])/10)
# house
house_dict = {'NONE': 0, 'OTHER': 0, 'ANY': 0, 'RENT': 1, 'MORTGAGE': 2, 'OWN': 3}
X_train['home_ownership'] = X_train['home_ownership'].apply(lambda x: house_dict[x])
# purpose
purpose_dict = {'other': 0, 'small_business': 1, 'renewable_energy': 2, 'home_improvement': 3,
'house': 4, 'educational': 5, 'medical': 6, 'moving': 7, 'car': 8,
'major_purchase': 9, 'wedding': 10, 'vacation': 11, 'credit_card': 12,
'debt_consolidation': 13}
X_train['purpose'] = X_train['purpose'].apply(lambda x: purpose_dict[x])
# states
state_dict = {'AK': 0, 'WA': 1, 'ID': 2, 'MT': 3, 'ND': 4, 'MN': 5,
'OR': 6, 'WY': 7, 'SD': 8, 'WI': 9, 'MI': 10, 'NY': 11,
'VT': 12, 'NH': 13, 'MA': 14, 'CT': 15, 'RI': 16, 'ME': 17,
'CA': 18, 'NV': 19, 'UT': 20, 'CO': 21, 'NE': 22, 'IA': 23,
'KS': 24, 'MO': 25, 'IL': 26, 'IN': 27, 'OH': 28, 'PA': 29,
'NJ': 30, 'KY': 31, 'WV': 32, 'VA': 33, 'DC': 34, 'MD': 35,
'DE': 36, 'AZ': 37, 'NM': 38, 'OK': 39, 'AR': 40, 'TN': 41,
'NC': 42, 'TX': 43, 'LA': 44, 'MS': 45, 'AL': 46, 'GA': 47,
'SC': 48, 'FL': 49, 'HI': 50}
X_train['addr_state'] = X_train['addr_state'].apply(lambda x: state_dict[x])
# make NA's, inf and -inf 0
X_train = X_train.fillna(0)
X_train = X_train.replace([np.inf, -np.inf], 0)
In [240]:
X_train.columns
Out[240]:
In [241]:
# scaling and normalizing the features; keep the fitted scaler to apply
# the identical transformation to the test set later
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
After the categorical features are converted to numeric and normalized/scaled, we first check what the accuracy is when only the feature 'grade' (A-G) is used to predict 'charged off' (true/false). This is the classification LendingClub gave the loans: the closer to G, the higher the chance the loan ends in 'charged off'. For the accuracy estimation we use 'F1-weighted', where F1 = 2 * (precision * recall) / (precision + recall), so both precision and recall matter for the accuracy. Precision is the number of correct positive results divided by the number of all positive predictions, and recall is the number of correct positive results divided by the number of positives that should have been returned. The F1 score can be interpreted as a weighted average of precision and recall, reaching its best value at 1 and its worst at 0. Using only 'grade' as feature, the default value for C (inverse of regularization strength) and l1/lasso penalization, we get an F1 accuracy of 0.744.
In [242]:
clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, X_train_scaled.loc[:,['grade']], y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
A score of 0.744 does not look very high, but still a lot better than random. Nevertheless, if we look at the confusion matrix and the ROC curve we see a whole different picture. It turns out the algorithm predicts almost everything into the not-charged-off group and therefore gets the majority right, simply because there are far more fully paid loans than charged off loans (only 18% are charged off). The area under the curve is only 0.506, where random is 0.5. The prediction with logistic regression and only the grade feature is therefore about as good as random.
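A quick sanity check (a sketch, assuming the ~18%/82% class split mentioned above) shows that a "classifier" that always predicts 'Fully Paid' already gets roughly this F1-weighted score:
import numpy as np
from sklearn.metrics import f1_score

# 82 fully paid vs. 18 charged off, and a classifier that always predicts the majority class
y_true = np.array(['Fully Paid'] * 82 + ['Charged Off'] * 18)
y_pred = np.array(['Fully Paid'] * 100)
print(f1_score(y_true, y_pred, average='weighted'))  # ~0.74, close to the 0.744 found above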
In [243]:
from sklearn.model_selection import cross_val_predict
from pandas_confusion import ConfusionMatrix
prediction = cross_val_predict(clf, X_train_scaled.loc[:,['grade']], y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[243]:
In [251]:
y_score = cross_val_predict(clf, X_train_scaled.loc[:,['grade']], y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[251]:
We can now include all the features we selected (24) and see whether the prediction improves. Because we use regularization, the effect of useless features is automatically downgraded. This gives a slightly better F1-score of 0.751. The confusion matrix and the ROC curve/AUC score are also a little better, although still not great with an AUC of 0.515. The top-5 features most used by the algorithm are 'funded_amnt_inv', 'int_rate', 'sub_grade', 'funded_amnt' and 'annual_inc'; remarkably, grade itself is not among them.
In [256]:
clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, X_train_scaled, y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
In [257]:
prediction = cross_val_predict(clf, X_train_scaled, y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[257]:
In [258]:
y_score = cross_val_predict(clf, X_train_scaled, y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[258]:
In [281]:
clf = LogisticRegression(penalty='l1', C=10)
clf.fit(X_train_scaled, y_train)
coefs = clf.coef_
# find index of top 5 highest coefficients, aka most used features for prediction
positions = abs(coefs[0]).argsort()[-5:][::-1]
print(X_train_scaled.columns[positions])
print(coefs[0][positions])
We can also pick just 5 features with SelectKBest and see whether this works better. It turns out to perform exactly the same as grade alone, so SelectKBest with 5 features does not work as well as using all features.
In [260]:
new_X = (SelectKBest(mutual_info_classif, k=5)
.fit_transform(X_train_scaled, y_train))
In [262]:
print(new_X[0]) # term, int_rate, installment, grade, sub_grade
print(X_train_scaled.head())
new_X = pd.DataFrame(new_X, columns=['term', 'int_rate', 'installment', 'grade', 'sub_grade'])
In [263]:
clf = LogisticRegression(penalty='l1')
scores = cross_val_score(clf, new_X, y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
In [264]:
prediction = cross_val_predict(clf, new_X, y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[264]:
In [269]:
y_score = cross_val_predict(clf, new_X, y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[269]:
To see the statistical relevance of certain features, we can use the statsmodels package. We first use it with the 5 features selected by SelectKBest. There we see that only term, int_rate and installment are relevant. The confidence intervals of all features are small, but the coefficients are also very close to 0, so they do not seem to have a large influence.
Subsequently we do the same for the 5 features with the highest coefficients in the regularized logistic regression that uses all features. Of these, all features seem useful except 'sub_grade'. The coefficients are slightly higher and the confidence intervals small, but the conclusions are contradictory: funded_amnt and funded_amnt_inv have the highest coefficients, and while these two values should be roughly the same, they have opposite relations with the target charged_off. This makes no sense and suggests that the algorithm is still pretty much random.
In [274]:
y_train == 'Charged Off'
Out[274]:
In [275]:
import statsmodels.api as sm
print(new_X.columns)
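# note: sm.Logit does not add an intercept by default; wrap the features in sm.add_constant(...) to fit one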
logit = sm.Logit(y_train == 'Charged Off', np.array(new_X))
result = logit.fit()
print(result.summary())
In [277]:
logit = sm.Logit(y_train == 'Charged Off', np.array(
X_train_scaled.loc[:,['int_rate', 'annual_inc', 'sub_grade', 'term', 'dti']]))
result = logit.fit()
print(result.summary())
Another way to possibly increase performance is to tune the C (regularization) parameter. We do this with sklearn's GridSearchCV function. The best-performing C, although really close to the default, is C=1, giving an accuracy of 0.752. (Note that the grid search takes a long time to run.)
In [278]:
from sklearn.model_selection import GridSearchCV
dict_Cs = {'C': [0.001, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(penalty='l1'), dict_Cs, scoring='f1_weighted', cv=10)
clf.fit(X_train_scaled, y_train)
print(clf.best_params_)
print(clf.best_score_)
In [280]:
clf = LogisticRegression(penalty='l1', C=10)
scores = cross_val_score(clf, X_train_scaled, y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
In [282]:
prediction = cross_val_predict(clf, X_train_scaled, y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[282]:
In [283]:
y_score = cross_val_predict(clf, X_train_scaled, y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[283]:
To improve accuracy we can use a more complicated algorithm that scores well in many settings: random forest. This algorithm builds many decision trees from subsets of the samples and considers at each split only a fraction of the features, to prevent overfitting. Random forest is known to be relatively insensitive to the values of its parameters: the number of features used at each split and the number of trees in the forest. Nevertheless, sklearn's default number of trees is so low that we raise it to 100. The algorithm has feature selection built in (at each split), and scaling/normalization is not necessary either.
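A minimal sketch of the parameters just discussed (illustrative values, not tuned):
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    max_features='sqrt',   # features considered at each split ('sqrt' is the usual classification default)
    bootstrap=True)        # each tree is grown on a bootstrap sample of the rows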
We first run the algorithm with only grade. This does not make much sense for random forest, since it builds trees and you cannot build much of a tree from a single feature; nevertheless it is our starting point. The F1 score is 0.739, slightly lower than logistic regression. As expected, the confusion matrix is dramatic: the algorithm turns out to predict everything as fully paid, which is why the AUC score is exactly random.
In [284]:
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train.loc[:,['grade']], y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
In [285]:
prediction = cross_val_predict(clf, X_train.loc[:,['grade']], y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[285]:
In [287]:
y_score = cross_val_predict(clf, X_train.loc[:,['grade']], y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[287]:
Trying the algorithm with all the features (24) leads to a slightly higher F1-score of 0.750, though logistic regression with all features was a fraction better. The confusion matrix and AUC are also comparable but slightly worse than those of logistic regression with all features. The random forest classifier does select a different top-5 of features: 'dti', 'revol_bal', 'revol_util', 'annual_inc' and 'int_rate'.
In [288]:
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_weighted')
print(scores)
print(np.mean(scores))
In [289]:
prediction = cross_val_predict(clf, X_train, y_train, cv=10)
confusion_matrix = ConfusionMatrix(y_train, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[289]:
In [290]:
y_score = cross_val_predict(clf, X_train, y_train, cv=10, method='predict_proba')
fpr, tpr, thresholds = roc_curve(y_train, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[290]:
In [291]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
feat_imp = clf.feature_importances_
In [292]:
sns.barplot(x=X_train.columns, y=feat_imp, color='turquoise')
plt.xticks(rotation=90)
Out[292]:
In [293]:
positions = abs(feat_imp).argsort()[-5:][::-1]
print(X_train.columns[positions])
print(feat_imp[positions])
To test the accuracy of our algorithms we first apply the same transformations to the test set as we did to the training set: convert the categorical features to numeric and replace NaN/inf/-inf with 0. For the logistic regression algorithm we also normalized and scaled the training set and saved that scaler, so we can apply the exact same transformation to the test set.
In [294]:
# term
X_test['term'] = X_test['term'].apply(lambda x: int(x.split(' ')[1]))
# grade
X_test['grade'] = X_test['grade'].apply(lambda x: grade_dict[x])
# emp_length
X_test['emp_length'] = X_test['emp_length'].apply(lambda x: emp_length_dict[x])
# zipcode
X_test['zip_code'] = X_test['zip_code'].apply(lambda x: int(x[0:3]))
# subgrade
X_test['sub_grade'] = X_test['grade'] + X_test['sub_grade'].apply(lambda x: float(list(x)[1])/10)
# house
X_test['home_ownership'] = X_test['home_ownership'].apply(lambda x: house_dict[x])
# purpose
X_test['purpose'] = X_test['purpose'].apply(lambda x: purpose_dict[x])
# states
X_test['addr_state'] = X_test['addr_state'].apply(lambda x: state_dict[x])
# make NA's, inf and -inf 0
X_test = X_test.fillna(0)
X_test = X_test.replace([np.inf, -np.inf], 0)
In [295]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
For logistic regression we test both the 'only grade' algorithm (baseline) and the best-performing algorithm (all features with l1 regularization). We find practically the same F1-scores, confusion matrices, ROC curves and AUC scores as on the training set, so the cross-validation scheme used on the training set gives reliable accuracy estimates. The predictive value of the algorithm increases slightly with more features, but it basically predicts that all loans get fully paid, so the accuracy scores remain practically random.
In [296]:
from sklearn.metrics import f1_score
clf = LogisticRegression(penalty='l1', C=10)
clf.fit(X_train_scaled.loc[:,['grade']], y_train)
prediction = clf.predict(X_test_scaled.loc[:,['grade']])
print(f1_score(y_test, prediction, average='weighted'))
confusion_matrix = ConfusionMatrix(y_test, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[296]:
In [297]:
y_score = clf.predict_proba(X_test_scaled.loc[:,['grade']])
print(clf.classes_)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[297]:
In [298]:
clf = LogisticRegression(penalty='l1', C=10)
clf.fit(X_train_scaled, y_train)
prediction = clf.predict(X_test_scaled)
print(f1_score(y_test, prediction, average='weighted'))
confusion_matrix = ConfusionMatrix(y_test, prediction)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[298]:
In [299]:
y_score = clf.predict_proba(X_test_scaled)
print(clf.classes_)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[299]:
In [344]:
closed_loans2 = closed_loans.drop(['loan_status'], axis=1)
# term
closed_loans2['term'] = closed_loans2['term'].apply(lambda x: int(x.split(' ')[1]))
# grade
closed_loans2['grade'] = closed_loans2['grade'].apply(lambda x: grade_dict[x])
# emp_length
closed_loans2['emp_length'] = closed_loans2['emp_length'].apply(lambda x: emp_length_dict[x])
# zipcode
closed_loans2['zip_code'] = closed_loans2['zip_code'].apply(lambda x: int(x[0:3]))
# subgrade
closed_loans2['sub_grade'] = closed_loans2['grade'] + closed_loans2['sub_grade'].apply(lambda x: float(list(x)[1])/10)
# house
closed_loans2['home_ownership'] = closed_loans2['home_ownership'].apply(lambda x: house_dict[x])
# purpose
closed_loans2['purpose'] = closed_loans2['purpose'].apply(lambda x: purpose_dict[x])
# states
closed_loans2['addr_state'] = closed_loans2['addr_state'].apply(lambda x: state_dict[x])
# make NA's, inf and -inf 0
closed_loans2 = closed_loans2.fillna(0)
closed_loans2 = closed_loans2.replace([np.inf, -np.inf], 0)
closed_loans_scaled = scaler.transform(closed_loans2)
closed_loans_scaled = pd.DataFrame(closed_loans_scaled, columns=closed_loans2.columns)
closed_loans_scaled.index = closed_loans2.index
In [352]:
closed_loans_scaled
Out[352]:
In [396]:
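# return on investment: total received (principal + interest + late fees + recoveries) relative to the funded amount, minus 1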
loans['roi'] = ((loans['total_rec_int'] + loans['total_rec_prncp']
+ loans['total_rec_late_fee'] + loans['recoveries']) / loans['funded_amnt']) -1
In [407]:
prof_loans = loans[loans['id'].isin(closed_loans['loan_status'][y_score[:,1] > 0.9].index.tolist())]
In [399]:
roi = loans.groupby('grade')['roi'].mean()
In [401]:
prof_loans = loans[loans['id'].isin(closed_loans.index.tolist())]
In [413]:
roi = prof_loans.groupby('grade')['roi'].mean()
print(roi)
print(prof_loans['roi'].mean())
In [424]:
prof_loans['grade'] = prof_loans['grade'].astype(pd.CategoricalDtype(ordered=True))
sns.barplot(data=roi.reset_index(), x='grade', y='roi', color='gray')
plt.show()
roi = prof_loans.groupby('loan_status')['roi'].mean()
sns.barplot(data=roi.reset_index(), x='loan_status', y='roi')
plt.show()
roi = prof_loans.groupby(['grade', 'loan_status'])['roi'].mean()
sns.barplot(data=roi.reset_index(), x='roi', y='grade', hue='loan_status', orient='h')
plt.show()
sns.countplot(data=prof_loans, x='grade', hue='loan_status')
plt.show()
In [409]:
prof_loans
Out[409]:
In [393]:
closed_loans.index.tolist()
Out[393]:
In [388]:
y_score = clf.predict_proba(closed_loans_scaled)
prediction = clf.predict(closed_loans_scaled)
confusion_matrix = ConfusionMatrix(np.array(closed_loans['loan_status'][y_score[:,1] > 0.9]), prediction[y_score[:,1] > 0.9])
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[388]:
In [386]:
np.array(closed_loans['loan_status'][y_score[:,1] > 0.9])
Out[386]:
In [377]:
prediction[y_score[:,1] > 0.9]
Out[377]:
In [374]:
y_total[y_score[:,1] > 0.9]
Out[374]:
In [371]:
prediction[y_score[:,1] > 0.9]
Out[371]:
In [414]:
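# recombine the scaled train and test sets to inspect the loans the model is most confident about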
X_total = pd.concat([X_train_scaled, X_test_scaled])
y_total = pd.concat([y_train, y_test])
In [415]:
y_score = clf.predict_proba(X_total)
prediction = clf.predict(X_total)
confusion_matrix = ConfusionMatrix(y_total[y_score[:,1] > 0.9], prediction[y_score[:,1] > 0.9])
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[415]:
In [416]:
diff_mean = X_total[y_score[:,1] > 0.9].mean() - X_total.mean()
abs(diff_mean).sort_values(ascending=False)
Out[416]:
In [421]:
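# t-test per feature: do the confidently predicted (>0.9) loans differ from the full population?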
for col in X_total.columns:
result = ttest_ind(X_total[y_score[:,1] > 0.9][col], X_total[col])
print(col, ':', result)
#X_total[y_score[:,1] > 0.9].mean() - X_total.mean()
In [418]:
X_total[y_score[:,1] > 0.9]['term']
Out[418]:
In [320]:
X_total.mean()
Out[320]:
In [309]:
X_total #most interesting features: 'int_rate', 'annual_inc', 'sub_grade', 'term', 'dti'
# compare predicted > 0.9 vs. all?
Out[309]:
In [40]:
sum(y_score[:,1] > 0.5) / len(y_score[:,1] )
Out[40]:
In [41]:
max(y_score[prediction == 'Fully Paid', 0])  # highest P(Charged Off) among loans predicted 'Fully Paid'
Out[41]:
In [74]:
# lower the decision threshold for 'Charged Off' from 0.5 to the ~18% base rate
diff_thres = np.where(y_score[:,0] > 0.18, 'Charged Off', 'Fully Paid')
In [79]:
print(f1_score(y_test, diff_thres, average='weighted'))
confusion_matrix = ConfusionMatrix(y_test, diff_thres)
confusion_matrix.print_stats()
confusion_matrix.plot()
Out[79]:
Also with the random forest algorithm, for both 'only grade' and all features, we find the same accuracy measurements as measured with cross-validation on the training set. The logistic regression algorithm with all features therefore still performs best, although it does not perform very well.
In [46]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train.loc[:,['grade']], y_train)
prediction = clf.predict(X_test.loc[:,['grade']])
print(f1_score(y_test, prediction, average='weighted'))
confusion_matrix = ConfusionMatrix(y_test, prediction)
print(confusion_matrix)
confusion_matrix.plot()
Out[46]:
In [47]:
y_score = clf.predict_proba(X_test.loc[:,['grade']])
fpr, tpr, thresholds = roc_curve(y_test, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[47]:
In [48]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(f1_score(y_test, prediction, average='weighted'))
confusion_matrix = ConfusionMatrix(y_test, prediction)
print(confusion_matrix)
confusion_matrix.plot()
Out[48]:
In [49]:
y_score = clf.predict_proba(X_test)
print(clf.classes_)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:,0], pos_label='Charged Off')
print(auc(fpr, tpr))
plt.plot(fpr, tpr)
Out[49]:
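Finally, instead of predicting charge-off we try to predict the grade itself from the other features. We make a new split with 'grade' as the target (still stratified by 'loan_status') and drop 'grade', 'sub_grade' and 'int_rate' from the features, since these would leak the answer. Grade is a 7-class problem (A-G), so we binarize the labels and train one-vs-rest classifiers, first logistic regression and then random forest.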
In [50]:
X_train, X_test, y_train, y_test = train_test_split(closed_loans.iloc[:, 0:24],
closed_loans['grade'], test_size=0.3,
random_state=123, stratify=closed_loans['loan_status'])
X_train = X_train.drop(['grade', 'sub_grade', 'int_rate'], axis=1)
X_test = X_test.drop(['grade', 'sub_grade', 'int_rate'], axis=1)
In [51]:
# features that are not float or int, so not to be converted:
# date:
# earliest_cr_line
# ordered:
# emp_length, zip_code, term
# unordered:
# home_ownership, purpose, addr_state (ordered geographically)
# date
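# note: strftime("%s") (seconds since the epoch) is a platform-dependent extension, not standard on every OS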
X_train['earliest_cr_line'] = pd.to_datetime(X_train['earliest_cr_line']).dt.strftime("%s")
X_train['earliest_cr_line'] = [0 if date=='NaT' else int(date) for date in X_train['earliest_cr_line']]
# term
X_train['term'] = X_train['term'].apply(lambda x: int(x.split(' ')[1]))
# emp_length
emp_length_dict = {'n/a':0,
'< 1 year':0,
'1 year':1,
'2 years':2,
'3 years':3,
'4 years':4,
'5 years':5,
'6 years':6,
'7 years':7,
'8 years':8,
'9 years':9,
'10+ years':10}
X_train['emp_length'] = X_train['emp_length'].apply(lambda x: emp_length_dict[x])
# zipcode
X_train['zip_code'] = X_train['zip_code'].apply(lambda x: int(x[0:3]))
# house
house_dict = {'NONE': 0, 'OTHER': 0, 'ANY': 0, 'RENT': 1, 'MORTGAGE': 2, 'OWN': 3}
X_train['home_ownership'] = X_train['home_ownership'].apply(lambda x: house_dict[x])
# purpose
purpose_dict = {'other': 0, 'small_business': 1, 'renewable_energy': 2, 'home_improvement': 3,
'house': 4, 'educational': 5, 'medical': 6, 'moving': 7, 'car': 8,
'major_purchase': 9, 'wedding': 10, 'vacation': 11, 'credit_card': 12,
'debt_consolidation': 13}
X_train['purpose'] = X_train['purpose'].apply(lambda x: purpose_dict[x])
# states
state_dict = {'AK': 0, 'WA': 1, 'ID': 2, 'MT': 3, 'ND': 4, 'MN': 5,
'OR': 6, 'WY': 7, 'SD': 8, 'WI': 9, 'MI': 10, 'NY': 11,
'VT': 12, 'NH': 13, 'MA': 14, 'CT': 15, 'RI': 16, 'ME': 17,
'CA': 18, 'NV': 19, 'UT': 20, 'CO': 21, 'NE': 22, 'IA': 23,
'KS': 24, 'MO': 25, 'IL': 26, 'IN': 27, 'OH': 28, 'PA': 29,
'NJ': 30, 'KY': 31, 'WV': 32, 'VA': 33, 'DC': 34, 'MD': 35,
'DE': 36, 'AZ': 37, 'NM': 38, 'OK': 39, 'AR': 40, 'TN': 41,
'NC': 42, 'TX': 43, 'LA': 44, 'MS': 45, 'AL': 46, 'GA': 47,
'SC': 48, 'FL': 49, 'HI': 50}
X_train['addr_state'] = X_train['addr_state'].apply(lambda x: state_dict[x])
# make NA's, inf and -inf 0
X_train = X_train.fillna(0)
X_train = X_train.replace([np.inf, -np.inf], 0)
# date
X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line']).dt.strftime("%s")
X_test['earliest_cr_line'] = [0 if date=='NaT' else int(date) for date in X_test['earliest_cr_line']]
# term
X_test['term'] = X_test['term'].apply(lambda x: int(x.split(' ')[1]))
# emp_length
X_test['emp_length'] = X_test['emp_length'].apply(lambda x: emp_length_dict[x])
# zipcode
X_test['zip_code'] = X_test['zip_code'].apply(lambda x: int(x[0:3]))
# house
X_test['home_ownership'] = X_test['home_ownership'].apply(lambda x: house_dict[x])
# purpose
X_test['purpose'] = X_test['purpose'].apply(lambda x: purpose_dict[x])
# states
X_test['addr_state'] = X_test['addr_state'].apply(lambda x: state_dict[x])
# make NA's, inf and -inf 0
X_test = X_test.fillna(0)
X_test = X_test.replace([np.inf, -np.inf], 0)
In [52]:
from sklearn import preprocessing
# fit the scaler on the training set and apply the identical transformation to both sets
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
In [53]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
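# LabelBinarizer one-hot encodes the 7 grade labels; OneVsRestClassifier then fits one binary classifier per grade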
lb = LabelBinarizer()
grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
lb.fit(grades)
y_train_2 = lb.transform(y_train)
clf = OneVsRestClassifier(LogisticRegression(penalty='l1'))
predict_y = clf.fit(X_train_scaled, y_train_2).predict(X_test_scaled)
predict_y = lb.inverse_transform(predict_y)
#print(accuracy_score(y_test, predict_y))
confusion_matrix = ConfusionMatrix(np.array(y_test, dtype='<U1'), predict_y)
confusion_matrix.plot()
confusion_matrix.print_stats()
# find index of top 5 highest coefficients, aka most used features for prediction
coefs = clf.coef_
positions = abs(coefs[0]).argsort()[-5:][::-1]
print(X_train_scaled.columns[positions])
print(coefs[0][positions])
In [54]:
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100))
predict_y = clf.fit(X_train_scaled, y_train_2).predict(X_test_scaled)
predict_y = lb.inverse_transform(predict_y)
print(accuracy_score(y_test, predict_y))
confusion_matrix = ConfusionMatrix(np.array(y_test, dtype='<U1'), predict_y)
confusion_matrix.plot()
print(confusion_matrix)
In [55]:
confusion_matrix.print_stats()
In [56]:
features = []
for i,j in enumerate(grades):
print('\n',j)
feat_imp = clf.estimators_[i].feature_importances_
positions = abs(feat_imp).argsort()[-5:][::-1]
features.extend(list(X_train.columns[positions]))
print(X_train.columns[positions])
print(feat_imp[positions])
In [57]:
pd.Series(features).value_counts()
Out[57]: