In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

In [37]:
df3a = pd.read_csv('LoanStats3a.csv',skiprows=1,skipfooter=2)


/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning:

Falling back to the 'python' engine because the 'c' engine does not support skip_footer; you can avoid this warning by specifying engine='python'.


In [38]:
df3a.head()


Out[38]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit total_il_high_credit_limit
0 1077501 1296599.0 5000.0 5000.0 4975.0 36 months 10.65% 162.87 B B2 ... NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN
1 1077430 1314167.0 2500.0 2500.0 2500.0 60 months 15.27% 59.83 C C4 ... NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN
2 1077175 1313524.0 2400.0 2400.0 2400.0 36 months 15.96% 84.33 C C5 ... NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN
3 1076863 1277178.0 10000.0 10000.0 10000.0 36 months 13.49% 339.31 C C1 ... NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN
4 1075358 1311748.0 3000.0 3000.0 3000.0 60 months 12.69% 67.79 B B5 ... NaN NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN

5 rows × 111 columns


In [39]:
def convert_to_period( x):
        if x is not None:
            return pd.Period(x, 'M')

In [40]:
df3a.issue_d


Out[40]:
0        Dec-2011
1        Dec-2011
2        Dec-2011
3        Dec-2011
4        Dec-2011
5        Dec-2011
6        Dec-2011
7        Dec-2011
8        Dec-2011
9        Dec-2011
10       Dec-2011
11       Dec-2011
12       Dec-2011
13       Dec-2011
14       Dec-2011
15       Dec-2011
16       Dec-2011
17       Dec-2011
18       Dec-2011
19       Dec-2011
20       Dec-2011
21       Dec-2011
22       Dec-2011
23       Dec-2011
24       Dec-2011
25       Dec-2011
26       Dec-2011
27       Dec-2011
28       Dec-2011
29       Dec-2011
           ...   
42506    Jul-2007
42507    Jul-2007
42508    Jul-2007
42509    Jul-2007
42510    Jul-2007
42511    Jul-2007
42512    Jul-2007
42513    Jun-2007
42514    Jun-2007
42515    Jun-2007
42516    Jun-2007
42517    Jun-2007
42518    Jun-2007
42519    Jun-2007
42520    Jun-2007
42521    Jun-2007
42522    Jun-2007
42523    Jun-2007
42524    Jun-2007
42525    Jun-2007
42526    Jun-2007
42527    Jun-2007
42528    Jun-2007
42529    Jun-2007
42530    Jun-2007
42531    Jun-2007
42532    Jun-2007
42533    Jun-2007
42534    Jun-2007
42535    Jun-2007
Name: issue_d, dtype: object

In [41]:
pd.isnull(df3a.earliest_cr_line).value_counts()


Out[41]:
False    42506
True        30
Name: earliest_cr_line, dtype: int64

In [18]:
#df3a[['mths_since_last_delinq', 'mths_since_last_record', 'issue_d', 'earliest_cr_line', 'issue_d']]
df3a.earliest_cr_line = df3a.earliest_cr_line.apply(lambda x: convert_to_period(x))
#df3a.issue_d-df3a.earliest_cr_line

In [19]:
df3a.issue_d=df3a.issue_d.apply(lambda x: convert_to_period(x))

In [27]:
df3a['cr_length_prior']=(df3a.issue_d-df3a.earliest_cr_line).astype(float).fillna(0.0)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-27-e9c65887397e> in <module>()
----> 1 df3a['cr_length_prior']=(df3a.issue_d-df3a.earliest_cr_line).astype(float).fillna(0.0)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py in astype(self, dtype, copy, raise_on_error, **kwargs)
   2948 
   2949         mgr = self._data.astype(dtype=dtype, copy=copy,
-> 2950                                 raise_on_error=raise_on_error, **kwargs)
   2951         return self._constructor(mgr).__finalize__(self)
   2952 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   2936 
   2937     def astype(self, dtype, **kwargs):
-> 2938         return self.apply('astype', dtype=dtype, **kwargs)
   2939 
   2940     def convert(self, **kwargs):

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, raw, **kwargs)
   2888 
   2889             kwargs['mgr'] = self
-> 2890             applied = getattr(b, f)(**kwargs)
   2891             result_blocks = _extend_blocks(applied, result_blocks)
   2892 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, copy, raise_on_error, values, **kwargs)
    432                **kwargs):
    433         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 434                             values=values, **kwargs)
    435 
    436     def _astype(self, dtype, copy=False, raise_on_error=True, values=None,

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
    475 
    476                 # _astype_nansafe works fine with 1-d only
--> 477                 values = com._astype_nansafe(values.ravel(), dtype, copy=True)
    478                 values = values.reshape(self.shape)
    479 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/common.py in _astype_nansafe(arr, dtype, copy)
   1918 
   1919     if copy:
-> 1920         return arr.astype(dtype)
   1921     return arr.view(dtype)
   1922 

TypeError: float() argument must be a string or a number, not 'pandas._period.Period'

In [5]:
pd.isnull(df3a.tax_liens).value_counts()


Out[5]:
False    42430
True       106
Name: tax_liens, dtype: int64

In [6]:
df3a.tax_liens.hist()


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10644a860>

In [70]:
df3a.recoveries.unique()


Out[70]:
array([    0.  ,   117.08,   189.06, ...,    28.63,   872.89,  4897.92])

In [72]:
df3a.chargeoff_within_12_mths.value_counts()


Out[72]:
0.0    42390
Name: chargeoff_within_12_mths, dtype: int64

In [ ]:


In [ ]:


In [ ]:

Since there are 111 columns , lets first find columns with null/empty through out so that we can quickly clean them up. We go for 111 columns to 57 columns which have some data.


In [45]:
df3a.dropna(axis=1, how='all',inplace=True)

In [46]:
df3a.columns.tolist()


Out[46]:
['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'policy_code',
 'application_type',
 'acc_now_delinq',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'pub_rec_bankruptcies',
 'tax_liens']

Now we can look for data to do some exploratory stuff.


In [22]:
df3a.application_type.unique() ##Only one type, may not be that useful so not keeping it.


Out[22]:
array(['INDIVIDUAL', None], dtype=object)

In [23]:
cols_to_keep=['id','loan_amnt','funded_amnt','funded_amnt_inv','term','int_rate','installment','grade','sub_grade','emp_title','emp_length','home_ownership','annual_inc','verification_status','issue_d','pymnt_plan','purpose','title','addr_state','dti','delinq_2yrs','earliest_cr_line','inq_last_6mths','mths_since_last_delinq','mths_since_last_record','open_acc','pub_rec','revol_bal','revol_util','total_acc','initial_list_status','out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee','last_pymnt_d','last_pymnt_amnt','next_pymnt_d','last_credit_pull_d','collections_12_mths_ex_med','mths_since_last_major_derog','policy_code','loan_status']

In [72]:
df3a.home_ownership.unique()


Out[72]:
array(['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE'], dtype=object)

In [24]:
df3a=df3a[cols_to_keep]

In [25]:
df3a.loan_status.unique()


Out[25]:
array(['Fully Paid', 'Charged Off', 'Current', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default', None,
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'], dtype=object)

Remove the rows with NA for loan status


In [26]:
df3a.dropna(subset=['loan_status'],how='any',inplace=True)

In [27]:
df3a.loan_status.value_counts().plot(kind='bar')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x118aa9828>

In [28]:
default_status = {'Fully Paid':1,'Charged Off':0,'Current':1,'In Grace Period':0,'Late (31-120 days)':0,'Default':0,'Does not meet the credit policy. Status:Fully Paid':1,'Does not meet the credit policy. Status:Charged Off':0,'Late (16-30 days)':0}

In [29]:
df3a['defaultYN']=df3a.loan_status.map(default_status)

In [30]:
df3a.defaultYN.value_counts()


Out[30]:
1    36085
0     6450
Name: defaultYN, dtype: int64

In [31]:
#sns.countplot(x="defaultYN",data=df3a)
df3a.grade.value_counts()


Out[31]:
B    12389
A    10183
C     8740
D     6016
E     3394
F     1301
G      512
Name: grade, dtype: int64

In [32]:
sns.distplot(df3a.loan_amnt)


/Users/Satish/anaconda/envs/py35/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning:

using a non-integer number instead of an integer will result in an error in the future

Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a03eac8>

In [11]:
sns.distplot(df3a.annual_inc)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-3c20795ac625> in <module>()
----> 1 sns.distplot(df3a.annual_inc)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
    204     if hist:
    205         if bins is None:
--> 206             bins = min(_freedman_diaconis_bins(a), 50)
    207         hist_kws.setdefault("alpha", 0.4)
    208         hist_kws.setdefault("normed", norm_hist)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/seaborn/distributions.py in _freedman_diaconis_bins(a)
     30         return int(np.sqrt(a.size))
     31     else:
---> 32         return int(np.ceil((a.max() - a.min()) / h))
     33 
     34 

ValueError: cannot convert float NaN to integer

In [34]:
sns.countplot(df3a.defaultYN)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a0ef630>

In [ ]: