In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
In [37]:
df3a = pd.read_csv('LoanStats3a.csv',skiprows=1,skipfooter=2)
In [38]:
df3a.head()
Out[38]:
In [39]:
def convert_to_period( x):
if x is not None:
return pd.Period(x, 'M')
In [40]:
df3a.issue_d
Out[40]:
In [41]:
pd.isnull(df3a.earliest_cr_line).value_counts()
Out[41]:
In [18]:
#df3a[['mths_since_last_delinq', 'mths_since_last_record', 'issue_d', 'earliest_cr_line', 'issue_d']]
df3a.earliest_cr_line = df3a.earliest_cr_line.apply(lambda x: convert_to_period(x))
#df3a.issue_d-df3a.earliest_cr_line
In [19]:
df3a.issue_d=df3a.issue_d.apply(lambda x: convert_to_period(x))
In [27]:
df3a['cr_length_prior']=(df3a.issue_d-df3a.earliest_cr_line).astype(float).fillna(0.0)
In [5]:
pd.isnull(df3a.tax_liens).value_counts()
Out[5]:
In [6]:
df3a.tax_liens.hist()
Out[6]:
In [70]:
df3a.recoveries.unique()
Out[70]:
In [72]:
df3a.chargeoff_within_12_mths.value_counts()
Out[72]:
In [ ]:
In [ ]:
In [ ]:
Since there are 111 columns , lets first find columns with null/empty through out so that we can quickly clean them up. We go for 111 columns to 57 columns which have some data.
In [45]:
df3a.dropna(axis=1, how='all',inplace=True)
In [46]:
df3a.columns.tolist()
Out[46]:
Now we can look for data to do some exploratory stuff.
In [22]:
df3a.application_type.unique() ##Only one type, may not be that useful so not keeping it.
Out[22]:
In [23]:
cols_to_keep=['id','loan_amnt','funded_amnt','funded_amnt_inv','term','int_rate','installment','grade','sub_grade','emp_title','emp_length','home_ownership','annual_inc','verification_status','issue_d','pymnt_plan','purpose','title','addr_state','dti','delinq_2yrs','earliest_cr_line','inq_last_6mths','mths_since_last_delinq','mths_since_last_record','open_acc','pub_rec','revol_bal','revol_util','total_acc','initial_list_status','out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee','last_pymnt_d','last_pymnt_amnt','next_pymnt_d','last_credit_pull_d','collections_12_mths_ex_med','mths_since_last_major_derog','policy_code','loan_status']
In [72]:
df3a.home_ownership.unique()
Out[72]:
In [24]:
df3a=df3a[cols_to_keep]
In [25]:
df3a.loan_status.unique()
Out[25]:
Remove the rows with NA for loan status
In [26]:
df3a.dropna(subset=['loan_status'],how='any',inplace=True)
In [27]:
df3a.loan_status.value_counts().plot(kind='bar')
Out[27]:
In [28]:
default_status = {'Fully Paid':1,'Charged Off':0,'Current':1,'In Grace Period':0,'Late (31-120 days)':0,'Default':0,'Does not meet the credit policy. Status:Fully Paid':1,'Does not meet the credit policy. Status:Charged Off':0,'Late (16-30 days)':0}
In [29]:
df3a['defaultYN']=df3a.loan_status.map(default_status)
In [30]:
df3a.defaultYN.value_counts()
Out[30]:
In [31]:
#sns.countplot(x="defaultYN",data=df3a)
df3a.grade.value_counts()
Out[31]:
In [32]:
sns.distplot(df3a.loan_amnt)
Out[32]:
In [11]:
sns.distplot(df3a.annual_inc)
In [34]:
sns.countplot(df3a.defaultYN)
Out[34]:
In [ ]: