notebook.community

Edit and run



In [36]:

    
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf



In [37]:

    
df3a = pd.read_csv('LoanStats3a.csv',skiprows=1,skipfooter=2)









    



/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning:

Falling back to the 'python' engine because the 'c' engine does not support skip_footer; you can avoid this warning by specifying engine='python'.



In [38]:

    
df3a.head()









    Out[38]:






  
    
      
      id
      member_id
      loan_amnt
      funded_amnt
      funded_amnt_inv
      term
      int_rate
      installment
      grade
      sub_grade
      ...
      num_tl_90g_dpd_24m
      num_tl_op_past_12m
      pct_tl_nvr_dlq
      percent_bc_gt_75
      pub_rec_bankruptcies
      tax_liens
      tot_hi_cred_lim
      total_bal_ex_mort
      total_bc_limit
      total_il_high_credit_limit
    
  
  
    
      0
      1077501
      1296599.0
      5000.0
      5000.0
      4975.0
      36 months
      10.65%
      162.87
      B
      B2
      ...
      NaN
      NaN
      NaN
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
    
    
      1
      1077430
      1314167.0
      2500.0
      2500.0
      2500.0
      60 months
      15.27%
      59.83
      C
      C4
      ...
      NaN
      NaN
      NaN
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
    
    
      2
      1077175
      1313524.0
      2400.0
      2400.0
      2400.0
      36 months
      15.96%
      84.33
      C
      C5
      ...
      NaN
      NaN
      NaN
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
    
    
      3
      1076863
      1277178.0
      10000.0
      10000.0
      10000.0
      36 months
      13.49%
      339.31
      C
      C1
      ...
      NaN
      NaN
      NaN
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
    
    
      4
      1075358
      1311748.0
      3000.0
      3000.0
      3000.0
      60 months
      12.69%
      67.79
      B
      B5
      ...
      NaN
      NaN
      NaN
      NaN
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 111 columns



In [39]:

    
def convert_to_period( x):
        if x is not None:
            return pd.Period(x, 'M')



In [40]:

    
df3a.issue_d









    Out[40]:





0        Dec-2011
1        Dec-2011
2        Dec-2011
3        Dec-2011
4        Dec-2011
5        Dec-2011
6        Dec-2011
7        Dec-2011
8        Dec-2011
9        Dec-2011
10       Dec-2011
11       Dec-2011
12       Dec-2011
13       Dec-2011
14       Dec-2011
15       Dec-2011
16       Dec-2011
17       Dec-2011
18       Dec-2011
19       Dec-2011
20       Dec-2011
21       Dec-2011
22       Dec-2011
23       Dec-2011
24       Dec-2011
25       Dec-2011
26       Dec-2011
27       Dec-2011
28       Dec-2011
29       Dec-2011
           ...   
42506    Jul-2007
42507    Jul-2007
42508    Jul-2007
42509    Jul-2007
42510    Jul-2007
42511    Jul-2007
42512    Jul-2007
42513    Jun-2007
42514    Jun-2007
42515    Jun-2007
42516    Jun-2007
42517    Jun-2007
42518    Jun-2007
42519    Jun-2007
42520    Jun-2007
42521    Jun-2007
42522    Jun-2007
42523    Jun-2007
42524    Jun-2007
42525    Jun-2007
42526    Jun-2007
42527    Jun-2007
42528    Jun-2007
42529    Jun-2007
42530    Jun-2007
42531    Jun-2007
42532    Jun-2007
42533    Jun-2007
42534    Jun-2007
42535    Jun-2007
Name: issue_d, dtype: object



In [41]:

    
pd.isnull(df3a.earliest_cr_line).value_counts()









    Out[41]:





False    42506
True        30
Name: earliest_cr_line, dtype: int64



In [18]:

    
#df3a[['mths_since_last_delinq', 'mths_since_last_record', 'issue_d', 'earliest_cr_line', 'issue_d']]
df3a.earliest_cr_line = df3a.earliest_cr_line.apply(lambda x: convert_to_period(x))
#df3a.issue_d-df3a.earliest_cr_line



In [19]:

    
df3a.issue_d=df3a.issue_d.apply(lambda x: convert_to_period(x))



In [27]:

    
df3a['cr_length_prior']=(df3a.issue_d-df3a.earliest_cr_line).astype(float).fillna(0.0)









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-27-e9c65887397e> in <module>()
----> 1 df3a['cr_length_prior']=(df3a.issue_d-df3a.earliest_cr_line).astype(float).fillna(0.0)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py in astype(self, dtype, copy, raise_on_error, **kwargs)
   2948 
   2949         mgr = self._data.astype(dtype=dtype, copy=copy,
-> 2950                                 raise_on_error=raise_on_error, **kwargs)
   2951         return self._constructor(mgr).__finalize__(self)
   2952 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   2936 
   2937     def astype(self, dtype, **kwargs):
-> 2938         return self.apply('astype', dtype=dtype, **kwargs)
   2939 
   2940     def convert(self, **kwargs):

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, raw, **kwargs)
   2888 
   2889             kwargs['mgr'] = self
-> 2890             applied = getattr(b, f)(**kwargs)
   2891             result_blocks = _extend_blocks(applied, result_blocks)
   2892 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, copy, raise_on_error, values, **kwargs)
    432                **kwargs):
    433         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 434                             values=values, **kwargs)
    435 
    436     def _astype(self, dtype, copy=False, raise_on_error=True, values=None,

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
    475 
    476                 # _astype_nansafe works fine with 1-d only
--> 477                 values = com._astype_nansafe(values.ravel(), dtype, copy=True)
    478                 values = values.reshape(self.shape)
    479 

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/common.py in _astype_nansafe(arr, dtype, copy)
   1918 
   1919     if copy:
-> 1920         return arr.astype(dtype)
   1921     return arr.view(dtype)
   1922 

TypeError: float() argument must be a string or a number, not 'pandas._period.Period'



In [5]:

    
pd.isnull(df3a.tax_liens).value_counts()









    Out[5]:





False    42430
True       106
Name: tax_liens, dtype: int64



In [6]:

    
df3a.tax_liens.hist()









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x10644a860>



In [70]:

    
df3a.recoveries.unique()









    Out[70]:





array([    0.  ,   117.08,   189.06, ...,    28.63,   872.89,  4897.92])



In [72]:

    
df3a.chargeoff_within_12_mths.value_counts()









    Out[72]:





0.0    42390
Name: chargeoff_within_12_mths, dtype: int64



In [ ]:



In [ ]:



In [ ]:

Since there are 111 columns , lets first find columns with null/empty through out so that we can quickly clean them up. We go for 111 columns to 57 columns which have some data.



In [45]:

    
df3a.dropna(axis=1, how='all',inplace=True)



In [46]:

    
df3a.columns.tolist()









    Out[46]:





['id',
 'member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'issue_d',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'initial_list_status',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'next_pymnt_d',
 'last_credit_pull_d',
 'collections_12_mths_ex_med',
 'policy_code',
 'application_type',
 'acc_now_delinq',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'pub_rec_bankruptcies',
 'tax_liens']

Now we can look for data to do some exploratory stuff.



In [22]:

    
df3a.application_type.unique() ##Only one type, may not be that useful so not keeping it.









    Out[22]:





array(['INDIVIDUAL', None], dtype=object)



In [23]:

    
cols_to_keep=['id','loan_amnt','funded_amnt','funded_amnt_inv','term','int_rate','installment','grade','sub_grade','emp_title','emp_length','home_ownership','annual_inc','verification_status','issue_d','pymnt_plan','purpose','title','addr_state','dti','delinq_2yrs','earliest_cr_line','inq_last_6mths','mths_since_last_delinq','mths_since_last_record','open_acc','pub_rec','revol_bal','revol_util','total_acc','initial_list_status','out_prncp','out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee','last_pymnt_d','last_pymnt_amnt','next_pymnt_d','last_credit_pull_d','collections_12_mths_ex_med','mths_since_last_major_derog','policy_code','loan_status']



In [72]:

    
df3a.home_ownership.unique()









    Out[72]:





array(['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE'], dtype=object)



In [24]:

    
df3a=df3a[cols_to_keep]



In [25]:

    
df3a.loan_status.unique()









    Out[25]:





array(['Fully Paid', 'Charged Off', 'Current', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default', None,
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'], dtype=object)

Remove the rows with NA for loan status



In [26]:

    
df3a.dropna(subset=['loan_status'],how='any',inplace=True)



In [27]:

    
df3a.loan_status.value_counts().plot(kind='bar')









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x118aa9828>



In [28]:

    
default_status = {'Fully Paid':1,'Charged Off':0,'Current':1,'In Grace Period':0,'Late (31-120 days)':0,'Default':0,'Does not meet the credit policy. Status:Fully Paid':1,'Does not meet the credit policy. Status:Charged Off':0,'Late (16-30 days)':0}



In [29]:

    
df3a['defaultYN']=df3a.loan_status.map(default_status)



In [30]:

    
df3a.defaultYN.value_counts()









    Out[30]:





1    36085
0     6450
Name: defaultYN, dtype: int64



In [31]:

    
#sns.countplot(x="defaultYN",data=df3a)
df3a.grade.value_counts()









    Out[31]:





B    12389
A    10183
C     8740
D     6016
E     3394
F     1301
G      512
Name: grade, dtype: int64



In [32]:

    
sns.distplot(df3a.loan_amnt)









    



/Users/Satish/anaconda/envs/py35/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning:

using a non-integer number instead of an integer will result in an error in the future







    Out[32]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a03eac8>



In [11]:

    
sns.distplot(df3a.annual_inc)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-3c20795ac625> in <module>()
----> 1 sns.distplot(df3a.annual_inc)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
    204     if hist:
    205         if bins is None:
--> 206             bins = min(_freedman_diaconis_bins(a), 50)
    207         hist_kws.setdefault("alpha", 0.4)
    208         hist_kws.setdefault("normed", norm_hist)

/Users/satishterala/anaconda2/envs/py35/lib/python3.5/site-packages/seaborn/distributions.py in _freedman_diaconis_bins(a)
     30         return int(np.sqrt(a.size))
     31     else:
---> 32         return int(np.ceil((a.max() - a.min()) / h))
     33 
     34 

ValueError: cannot convert float NaN to integer



In [34]:

    
sns.countplot(df3a.defaultYN)









    Out[34]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a0ef630>



In [ ]:

	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	...	num_tl_90g_dpd_24m	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit
0	1077501	1296599.0	5000.0	5000.0	4975.0	36 months	10.65%	162.87	B	B2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	1077430	1314167.0	2500.0	2500.0	2500.0	60 months	15.27%	59.83	C	C4	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	1077175	1313524.0	2400.0	2400.0	2400.0	36 months	15.96%	84.33	C	C5	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	1076863	1277178.0	10000.0	10000.0	10000.0	36 months	13.49%	339.31	C	C1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	1075358	1311748.0	3000.0	3000.0	3000.0	60 months	12.69%	67.79	B	B5	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN