notebook.community

Edit and run



In [15]:

    
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [16]:

    
data = pd.read_csv("cs-training.csv")



In [17]:

    
data.describe()









    Out[17]:







  
    
      
      Unnamed: 0
      SeriousDlqin2yrs
      RevolvingUtilizationOfUnsecuredLines
      age
      NumberOfTime30-59DaysPastDueNotWorse
      DebtRatio
      MonthlyIncome
      NumberOfOpenCreditLinesAndLoans
      NumberOfTimes90DaysLate
      NumberRealEstateLoansOrLines
      NumberOfTime60-89DaysPastDueNotWorse
      NumberOfDependents
    
  
  
    
      count
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      1.202690e+05
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      146076.000000
    
    
      mean
      75000.500000
      0.066840
      6.048438
      52.295207
      0.421033
      353.005076
      6.670221e+03
      8.452760
      0.265973
      1.018240
      0.240387
      0.757222
    
    
      std
      43301.414527
      0.249746
      249.755371
      14.771866
      4.192781
      2037.818523
      1.438467e+04
      5.145951
      4.169304
      1.129771
      4.155179
      1.115086
    
    
      min
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      37500.750000
      0.000000
      0.029867
      41.000000
      0.000000
      0.175074
      3.400000e+03
      5.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      75000.500000
      0.000000
      0.154181
      52.000000
      0.000000
      0.366508
      5.400000e+03
      8.000000
      0.000000
      1.000000
      0.000000
      0.000000
    
    
      75%
      112500.250000
      0.000000
      0.559046
      63.000000
      0.000000
      0.868254
      8.249000e+03
      11.000000
      0.000000
      2.000000
      0.000000
      1.000000
    
    
      max
      150000.000000
      1.000000
      50708.000000
      109.000000
      98.000000
      329664.000000
      3.008750e+06
      58.000000
      98.000000
      54.000000
      98.000000
      20.000000



In [18]:

    
data=data.drop('Unnamed: 0', axis = 1)



In [19]:

    
data.describe()









    Out[19]:







  
    
      
      SeriousDlqin2yrs
      RevolvingUtilizationOfUnsecuredLines
      age
      NumberOfTime30-59DaysPastDueNotWorse
      DebtRatio
      MonthlyIncome
      NumberOfOpenCreditLinesAndLoans
      NumberOfTimes90DaysLate
      NumberRealEstateLoansOrLines
      NumberOfTime60-89DaysPastDueNotWorse
      NumberOfDependents
    
  
  
    
      count
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      1.202690e+05
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      146076.000000
    
    
      mean
      0.066840
      6.048438
      52.295207
      0.421033
      353.005076
      6.670221e+03
      8.452760
      0.265973
      1.018240
      0.240387
      0.757222
    
    
      std
      0.249746
      249.755371
      14.771866
      4.192781
      2037.818523
      1.438467e+04
      5.145951
      4.169304
      1.129771
      4.155179
      1.115086
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.029867
      41.000000
      0.000000
      0.175074
      3.400000e+03
      5.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      0.154181
      52.000000
      0.000000
      0.366508
      5.400000e+03
      8.000000
      0.000000
      1.000000
      0.000000
      0.000000
    
    
      75%
      0.000000
      0.559046
      63.000000
      0.000000
      0.868254
      8.249000e+03
      11.000000
      0.000000
      2.000000
      0.000000
      1.000000
    
    
      max
      1.000000
      50708.000000
      109.000000
      98.000000
      329664.000000
      3.008750e+06
      58.000000
      98.000000
      54.000000
      98.000000
      20.000000



In [20]:

    
data.columns









    Out[20]:





Index([u'SeriousDlqin2yrs', u'RevolvingUtilizationOfUnsecuredLines', u'age',
       u'NumberOfTime30-59DaysPastDueNotWorse', u'DebtRatio', u'MonthlyIncome',
       u'NumberOfOpenCreditLinesAndLoans', u'NumberOfTimes90DaysLate',
       u'NumberRealEstateLoansOrLines',
       u'NumberOfTime60-89DaysPastDueNotWorse', u'NumberOfDependents'],
      dtype='object')



In [21]:

    
cleanCol = []
for i in range(len(data.columns)):
    cleanCol.append(data.columns[i].replace('-', ''))



In [22]:

    
cleanCol









    Out[22]:





['SeriousDlqin2yrs',
 'RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime3059DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime6089DaysPastDueNotWorse',
 'NumberOfDependents']



In [23]:

    
data.columns = cleanCol



In [24]:

    
data.describe()









    Out[24]:







  
    
      
      SeriousDlqin2yrs
      RevolvingUtilizationOfUnsecuredLines
      age
      NumberOfTime3059DaysPastDueNotWorse
      DebtRatio
      MonthlyIncome
      NumberOfOpenCreditLinesAndLoans
      NumberOfTimes90DaysLate
      NumberRealEstateLoansOrLines
      NumberOfTime6089DaysPastDueNotWorse
      NumberOfDependents
    
  
  
    
      count
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      1.202690e+05
      150000.000000
      150000.000000
      150000.000000
      150000.000000
      146076.000000
    
    
      mean
      0.066840
      6.048438
      52.295207
      0.421033
      353.005076
      6.670221e+03
      8.452760
      0.265973
      1.018240
      0.240387
      0.757222
    
    
      std
      0.249746
      249.755371
      14.771866
      4.192781
      2037.818523
      1.438467e+04
      5.145951
      4.169304
      1.129771
      4.155179
      1.115086
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000e+00
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.029867
      41.000000
      0.000000
      0.175074
      3.400000e+03
      5.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      0.154181
      52.000000
      0.000000
      0.366508
      5.400000e+03
      8.000000
      0.000000
      1.000000
      0.000000
      0.000000
    
    
      75%
      0.000000
      0.559046
      63.000000
      0.000000
      0.868254
      8.249000e+03
      11.000000
      0.000000
      2.000000
      0.000000
      1.000000
    
    
      max
      1.000000
      50708.000000
      109.000000
      98.000000
      329664.000000
      3.008750e+06
      58.000000
      98.000000
      54.000000
      98.000000
      20.000000



In [25]:

    
data.head(5)









    Out[25]:







  
    
      
      SeriousDlqin2yrs
      RevolvingUtilizationOfUnsecuredLines
      age
      NumberOfTime3059DaysPastDueNotWorse
      DebtRatio
      MonthlyIncome
      NumberOfOpenCreditLinesAndLoans
      NumberOfTimes90DaysLate
      NumberRealEstateLoansOrLines
      NumberOfTime6089DaysPastDueNotWorse
      NumberOfDependents
    
  
  
    
      0
      1
      0.766127
      45
      2
      0.802982
      9120.0
      13
      0
      6
      0
      2.0
    
    
      1
      0
      0.957151
      40
      0
      0.121876
      2600.0
      4
      0
      0
      0
      1.0
    
    
      2
      0
      0.658180
      38
      1
      0.085113
      3042.0
      2
      1
      0
      0
      0.0
    
    
      3
      0
      0.233810
      30
      0
      0.036050
      3300.0
      5
      0
      0
      0
      0.0
    
    
      4
      0
      0.907239
      49
      1
      0.024926
      63588.0
      7
      0
      1
      0
      0.0



In [26]:

    
data.isnull().sum()









    Out[26]:





SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime3059DaysPastDueNotWorse         0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime6089DaysPastDueNotWorse         0
NumberOfDependents                       3924
dtype: int64



In [27]:

    
data['age'].hist(bins=100)









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6dfbcf3b10>






    



/opt/conda/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [28]:

    
data.age.describe()









    Out[28]:





count    150000.000000
mean         52.295207
std          14.771866
min           0.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64



In [29]:

    
for i in range(0,110):
    print i, len(data[data.age == i])



In [30]:

    
"""age should be between a defined range, 0-109 makes less sense, should be between 22-91"""









    Out[30]:





'age should be between a defined range, 0-109 makes less sense, should be between 22-91'



In [31]:

    
np.median(data.age)
np.mean(data.age)









    Out[31]:





52.295206666666665



In [20]:

    
mean_age=np.mean(data.age)
ageNew=[]
for val in data.age:
    if val < 22 or val >91:
        ageNew.append(mean_age)
    else:
        ageNew.append(val)



In [21]:

    
ageNew









    Out[21]:





[45,
 40,
 38,
 30,
 49,
 74,
 57,
 39,
 27,
 57,
 30,
 51,
 46,
 40,
 76,
 64,
 78,
 53,
 43,
 25,
 43,
 38,
 39,
 32,
 58,
 50,
 58,
 69,
 24,
 58,
 28,
 24,
 62,
 57,
 42,
 64,
 50,
 75,
 26,
 52,
 41,
 81,
 31,
 28,
 68,
 70,
 62,
 31,
 38,
 70,
 73,
 31,
 62,
 51,
 29,
 55,
 46,
 64,
 43,
 35,
 58,
 69,
 72,
 46,
 60,
 67,
 55,
 27,
 31,
 64,
 36,
 67,
 56,
 31,
 49,
 36,
 29,
 37,
 69,
 41,
 52,
 66,
 53,
 56,
 31,
 58,
 58,
 64,
 45,
 60,
 51,
 83,
 52,
 34,
 44,
 37,
 48,
 57,
 32,
 75,
 61,
 34,
 52,
 44,
 57,
 46,
 80,
 32,
 62,
 61,
 47,
 58,
 52,
 68,
 39,
 74,
 46,
 44,
 68,
 68,
 31,
 34,
 67,
 58,
 39,
 58,
 61,
 59,
 36,
 59,
 43,
 77,
 48,
 81,
 51,
 53,
 56,
 63,
 55,
 54,
 55,
 72,
 53,
 27,
 31,
 38,
 40,
 48,
 50,
 34,
 48,
 57,
 30,
 28,
 36,
 60,
 58,
 49,
 40,
 63,
 50,
 38,
 47,
 63,
 42,
 54,
 63,
 29,
 36,
 33,
 72,
 29,
 53,
 79,
 58,
 69,
 69,
 75,
 41,
 35,
 46,
 65,
 55,
 55,
 51,
 25,
 44,
 86,
 56,
 55,
 64,
 53,
 49,
 55,
 80,
 38,
 44,
 40,
 35,
 69,
 64,
 61,
 46,
 49,
 64,
 40,
 27,
 60,
 52.295206666666665,
 50,
 64,
 66,
 53,
 62,
 63,
 50,
 61,
 42,
 63,
 73,
 72,
 52,
 45,
 44,
 63,
 37,
 38,
 36,
 41,
 63,
 76,
 43,
 64,
 67,
 52,
 66,
 55,
 42,
 28,
 47,
 61,
 73,
 58,
 68,
 41,
 44,
 42,
 40,
 78,
 81,
 49,
 58,
 54,
 33,
 53,
 23,
 49,
 44,
 63,
 42,
 65,
 54,
 28,
 78,
 66,
 32,
 43,
 59,
 76,
 68,
 61,
 68,
 57,
 68,
 57,
 46,
 69,
 44,
 28,
 53,
 40,
 36,
 42,
 64,
 57,
 62,
 47,
 62,
 49,
 66,
 54,
 56,
 83,
 45,
 29,
 67,
 29,
 46,
 25,
 68,
 33,
 55,
 46,
 66,
 66,
 48,
 52,
 32,
 51,
 27,
 32,
 30,
 37,
 49,
 37,
 36,
 62,
 57,
 59,
 38,
 52.295206666666665,
 58,
 37,
 63,
 32,
 52,
 43,
 29,
 45,
 38,
 62,
 29,
 41,
 29,
 47,
 55,
 87,
 41,
 61,
 28,
 43,
 71,
 46,
 30,
 43,
 59,
 47,
 61,
 72,
 50,
 68,
 58,
 69,
 71,
 48,
 37,
 50,
 38,
 35,
 59,
 67,
 30,
 56,
 51,
 40,
 28,
 54,
 46,
 57,
 35,
 79,
 46,
 79,
 64,
 74,
 64,
 63,
 63,
 54,
 63,
 45,
 22,
 66,
 57,
 43,
 61,
 54,
 90,
 60,
 34,
 38,
 60,
 76,
 76,
 39,
 83,
 63,
 52,
 58,
 42,
 63,
 58,
 62,
 46,
 27,
 27,
 35,
 55,
 73,
 51,
 36,
 56,
 59,
 49,
 64,
 56,
 37,
 48,
 36,
 70,
 49,
 57,
 39,
 61,
 38,
 24,
 33,
 73,
 79,
 81,
 55,
 57,
 58,
 52,
 51,
 63,
 52.295206666666665,
 56,
 41,
 43,
 56,
 59,
 40,
 74,
 49,
 68,
 57,
 48,
 40,
 23,
 52,
 48,
 55,
 61,
 33,
 69,
 39,
 84,
 32,
 78,
 61,
 37,
 33,
 81,
 60,
 46,
 57,
 57,
 32,
 42,
 38,
 82,
 33,
 68,
 52,
 46,
 66,
 57,
 50,
 34,
 53,
 76,
 64,
 72,
 48,
 71,
 59,
 53,
 34,
 51,
 55,
 46,
 27,
 43,
 75,
 31,
 43,
 51,
 28,
 51,
 55,
 69,
 42,
 33,
 60,
 54,
 48,
 49,
 73,
 91,
 66,
 49,
 46,
 37,
 54,
 22,
 31,
 39,
 56,
 44,
 50,
 55,
 52,
 58,
 62,
 42,
 47,
 42,
 39,
 45,
 82,
 64,
 55,
 58,
 35,
 62,
 52,
 40,
 57,
 45,
 27,
 24,
 37,
 36,
 40,
 86,
 56,
 59,
 40,
 49,
 66,
 63,
 45,
 30,
 36,
 39,
 27,
 44,
 34,
 59,
 38,
 38,
 39,
 43,
 57,
 75,
 29,
 55,
 60,
 37,
 69,
 48,
 89,
 56,
 75,
 55,
 63,
 49,
 31,
 61,
 46,
 40,
 91,
 46,
 56,
 29,
 45,
 49,
 64,
 51,
 45,
 78,
 36,
 54,
 67,
 63,
 61,
 86,
 50,
 51,
 70,
 44,
 39,
 49,
 58,
 39,
 33,
 53,
 36,
 62,
 62,
 29,
 44,
 32,
 52,
 69,
 71,
 63,
 44,
 54,
 41,
 32,
 87,
 43,
 47,
 75,
 64,
 41,
 30,
 40,
 56,
 55,
 36,
 57,
 56,
 49,
 53,
 51,
 64,
 28,
 54,
 53,
 37,
 70,
 48,
 52,
 36,
 51,
 69,
 52,
 67,
 36,
 49,
 51,
 36,
 33,
 62,
 69,
 70,
 49,
 46,
 65,
 56,
 51,
 57,
 48,
 47,
 33,
 75,
 40,
 40,
 68,
 35,
 65,
 81,
 52,
 69,
 34,
 40,
 35,
 80,
 68,
 47,
 64,
 36,
 49,
 55,
 35,
 63,
 64,
 64,
 26,
 77,
 32,
 54,
 40,
 53,
 58,
 50,
 56,
 75,
 36,
 33,
 52,
 62,
 60,
 52,
 36,
 71,
 83,
 76,
 40,
 46,
 78,
 36,
 76,
 41,
 60,
 63,
 52,
 58,
 56,
 33,
 64,
 33,
 40,
 56,
 60,
 27,
 41,
 48,
 69,
 39,
 36,
 34,
 50,
 79,
 54,
 36,
 39,
 35,
 38,
 52,
 24,
 76,
 41,
 74,
 47,
 57,
 63,
 62,
 39,
 51,
 60,
 68,
 63,
 55,
 58,
 55,
 33,
 58,
 55,
 44,
 57,
 65,
 56,
 25,
 89,
 32,
 48,
 38,
 58,
 59,
 32,
 50,
 26,
 55,
 82,
 62,
 46,
 66,
 46,
 59,
 29,
 39,
 36,
 71,
 82,
 26,
 58,
 68,
 42,
 58,
 26,
 33,
 51,
 60,
 29,
 79,
 27,
 50,
 31,
 45,
 26,
 63,
 40,
 57,
 54,
 33,
 54,
 24,
 41,
 47,
 76,
 87,
 34,
 46,
 82,
 48,
 83,
 86,
 56,
 90,
 73,
 33,
 85,
 45,
 56,
 31,
 30,
 32,
 40,
 79,
 37,
 63,
 28,
 69,
 47,
 71,
 56,
 40,
 64,
 59,
 28,
 45,
 31,
 58,
 44,
 37,
 68,
 47,
 57,
 51,
 50,
 74,
 73,
 81,
 58,
 87,
 55,
 50,
 34,
 67,
 63,
 62,
 59,
 32,
 64,
 72,
 64,
 30,
 79,
 81,
 41,
 58,
 55,
 45,
 44,
 49,
 74,
 26,
 59,
 39,
 48,
 49,
 47,
 28,
 42,
 53,
 68,
 31,
 34,
 56,
 30,
 52,
 54,
 39,
 57,
 60,
 45,
 33,
 64,
 85,
 34,
 37,
 63,
 35,
 32,
 73,
 39,
 38,
 40,
 42,
 36,
 64,
 51,
 25,
 36,
 34,
 73,
 55,
 39,
 83,
 63,
 52,
 59,
 24,
 44,
 36,
 74,
 45,
 41,
 63,
 41,
 45,
 54,
 76,
 65,
 38,
 47,
 68,
 34,
 46,
 55,
 34,
 29,
 35,
 43,
 37,
 74,
 55,
 72,
 80,
 50,
 77,
 73,
 52,
 48,
 44,
 50,
 44,
 70,
 53,
 70,
 36,
 63,
 50,
 48,
 62,
 53,
 43,
 58,
 67,
 59,
 55,
 82,
 47,
 43,
 58,
 41,
 63,
 50,
 32,
 37,
 36,
 45,
 26,
 80,
 40,
 35,
 52,
 38,
 78,
 57,
 57,
 32,
 43,
 60,
 54,
 59,
 ...]



In [22]:

    
data.age = ageNew



In [23]:

    
"""RevolvingUtilizationOfUnsecuredLines = Total balance on credit cards and personal lines of credit 
except real estate and no installment debt like car loans divided by the sum of credit limits"""









    Out[23]:





'RevolvingUtilizationOfUnsecuredLines = Total balance on credit cards and personal lines of credit \nexcept real estate and no installment debt like car loans divided by the sum of credit limits'



In [24]:

    
data.RevolvingUtilizationOfUnsecuredLines.describe()









    Out[24]:





count    150000.000000
mean          6.048438
std         249.755371
min           0.000000
25%           0.029867
50%           0.154181
75%           0.559046
max       50708.000000
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64



In [25]:

    
len(data[data.RevolvingUtilizationOfUnsecuredLines >1])









    Out[25]:





3321



In [26]:

    
"""the value here should be between 0-1 [implies 0 to 100%], but few values are more than 1 [implying more than 100%], so all those values must be a data entry error and should be changed to the value/100"""









    Out[26]:





'the value here should be between 0-1 [implies 0 to 100%], but few values are more than 1 [implying more than 100%], so all those values must be a data entry error and should be changed to the value/100'



In [27]:

    
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val >1:
        data['RUUL_indicator']=1
    else:
        data['RUUL_indicator']=0



In [28]:

    
RUULNew=[]

for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val <=10:
        RUULNew.append(val/10)
    elif val >10 and val <=100:
        RUULNew.append(val/100)
    elif val >100 and val <=1000:
        RUULNew.append(val/1000)
    elif val >1000 and val <=10000:
        RUULNew.append(val/10000)
    elif val >10000 and val <=100000:   
        RUULNew.append(val/100000)
    else:
        RUULNew.append(val)



In [29]:

    
data.RevolvingUtilizationOfUnsecuredLines = RUULNew



In [30]:

    
"""NumberOfTime3059DaysPastDueNotWorse"""









    Out[30]:





'NumberOfTime3059DaysPastDueNotWorse'



In [31]:

    
data.NumberOfTime3059DaysPastDueNotWorse.describe()









    Out[31]:





count    150000.000000
mean          0.421033
std           4.192781
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime3059DaysPastDueNotWorse, dtype: float64



In [32]:

    
for i in range(0,100):
    print i, len(data[data.NumberOfTime3059DaysPastDueNotWorse == i])



In [33]:

    
"""looks 96 and 98 are outliers"""









    Out[33]:





'looks 96 and 98 are outliers'



In [34]:

    
New = []
meanNOTT = data.NumberOfTime3059DaysPastDueNotWorse.mean()
for val in data.NumberOfTime3059DaysPastDueNotWorse:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTime3059DaysPastDueNotWorse = New



In [36]:

    
"""DebtRatio"""









    Out[36]:





'DebtRatio'



In [37]:

    
data.DebtRatio.describe()









    Out[37]:





count    150000.000000
mean        353.005076
std        2037.818523
min           0.000000
25%           0.175074
50%           0.366508
75%           0.868254
max      329664.000000
Name: DebtRatio, dtype: float64



In [38]:

    
len(data[data.DebtRatio > 1])









    Out[38]:





35137



In [39]:

    
len(data[data.DebtRatio >0])









    Out[39]:





145887



In [40]:

    
New = []
medianNOTT = data.DebtRatio.median()
for val in data.NumberRealEstateLoansOrLines:
    if val>1:
        New.append(medianNOTT)
    else:
        New.append(val)

data.DebtRatio = New



In [ ]:

    
"""NumberOfOpenCreditLinesAndLoans"""



In [41]:

    
data.NumberOfOpenCreditLinesAndLoans.describe()









    Out[41]:





count    150000.000000
mean          8.452760
std           5.145951
min           0.000000
25%           5.000000
50%           8.000000
75%          11.000000
max          58.000000
Name: NumberOfOpenCreditLinesAndLoans, dtype: float64



In [42]:

    
data['NumberOfOpenCreditLinesAndLoans'].hist(bins=100)









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0xbed5ba8>



In [ ]:

    
"""NumberOfTimes90DaysLate"""



In [43]:

    
data.NumberOfTimes90DaysLate.describe()









    Out[43]:





count    150000.000000
mean          0.265973
std           4.169304
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTimes90DaysLate, dtype: float64



In [44]:

    
for i in range(0,100):
    print i, len(data[data.NumberOfTimes90DaysLate == i])



In [45]:

    
New = []
meanNOTT = data.NumberOfTimes90DaysLate.mean()
for val in data.NumberOfTimes90DaysLate:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTimes90DaysLate = New



In [46]:

    
"""NumberRealEstateLoansOrLines"""









    Out[46]:





'NumberRealEstateLoansOrLines'



In [47]:

    
data.NumberRealEstateLoansOrLines.describe()









    Out[47]:





count    150000.000000
mean          1.018240
std           1.129771
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          54.000000
Name: NumberRealEstateLoansOrLines, dtype: float64



In [48]:

    
for i in range(0,55):
    print i, len(data[data.NumberRealEstateLoansOrLines == i])



In [49]:

    
New = []
meanNOTT = data.NumberRealEstateLoansOrLines.mean()
for val in data.NumberRealEstateLoansOrLines:
    if val>50:
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberRealEstateLoansOrLines = New



In [50]:

    
data.NumberRealEstateLoansOrLines.describe()









    Out[50]:





count    150000.000000
mean          1.017887
std           1.121458
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          32.000000
Name: NumberRealEstateLoansOrLines, dtype: float64



In [51]:

    
"""NumberOfTime6089DaysPastDueNotWorse"""









    Out[51]:





'NumberOfTime6089DaysPastDueNotWorse'



In [52]:

    
data.NumberOfTime6089DaysPastDueNotWorse.describe()









    Out[52]:





count    150000.000000
mean          0.240387
std           4.155179
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime6089DaysPastDueNotWorse, dtype: float64



In [53]:

    
New = []
meanNOTT = data.NumberOfTime6089DaysPastDueNotWorse.mean()
for val in data.NumberOfTime6089DaysPastDueNotWorse:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTime6089DaysPastDueNotWorse = New



In [54]:

    
data.NumberOfTime6089DaysPastDueNotWorse.describe()









    Out[54]:





count    150000.000000
mean          0.065138
std           0.329861
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          11.000000
Name: NumberOfTime6089DaysPastDueNotWorse, dtype: float64



In [55]:

    
"""NumberOfDependents"""









    Out[55]:





'NumberOfDependents'



In [56]:

    
data.NumberOfDependents.describe()









    Out[56]:





count    146076.000000
mean          0.757222
std           1.115086
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          20.000000
Name: NumberOfDependents, dtype: float64



In [57]:

    
for i in range(0,25):
    print i, len(data[data.NumberOfDependents == i])



In [58]:

    
"""having more than 10 dependents looks weird"""









    Out[58]:





'having more than 10 dependents looks weird'



In [59]:

    
New = []
meanNOTT = data.NumberOfDependents.mean()
for val in data.NumberOfDependents:
    if val>10:
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfDependents = New



In [60]:

    
data.NumberOfDependents.isnull().sum()









    Out[60]:





3924



In [61]:

    
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(0)



In [62]:

    
data.NumberOfDependents.describe()









    Out[62]:





count    150000.000000
mean          0.737203
std           1.105450
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          10.000000
Name: NumberOfDependents, dtype: float64



In [63]:

    
"""MonthlyIncome"""









    Out[63]:





'MonthlyIncome'



In [64]:

    
data.MonthlyIncome.describe()









    Out[64]:





count    1.202690e+05
mean     6.670221e+03
std      1.438467e+04
min      0.000000e+00
25%      3.400000e+03
50%      5.400000e+03
75%      8.249000e+03
max      3.008750e+06
Name: MonthlyIncome, dtype: float64



In [65]:

    
train = data[data.MonthlyIncome.isnull() == False]
test = data[data.MonthlyIncome.isnull() == True]



In [66]:

    
train.shape, test.shape









    Out[66]:





((120269, 12), (29731, 12))



In [67]:

    
X_train = train.drop(['MonthlyIncome', 'SeriousDlqin2yrs'], axis=1)
y_train = train.MonthlyIncome
X_test = test.drop(['MonthlyIncome', 'SeriousDlqin2yrs'], axis=1)



In [68]:

    
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error



In [69]:

    
lmMod = LinearRegression(fit_intercept=True, normalize=True).fit(X_train, y_train)



In [70]:

    
lmMod.coef_









    Out[70]:





array([-3948.4507996 ,    32.0281422 ,  -103.40827549,   -72.3226233 ,
         108.72297805,  -145.64343546,  1261.74822934,  -153.67162041,
         720.53837969,     0.        ])



In [71]:

    
pred = lmMod.predict(X_test)



In [72]:

    
predNoZero = []
for val in pred:
    if val >= 0:
        predNoZero.append(val)
    else:
        predNoZero.append(0.)



In [73]:

    
testFull = data[data.MonthlyIncome.isnull() == True]



In [74]:

    
testFull['MonthlyIncome'] = predNoZero









    



C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [76]:

    
monNew = []
for index in data.index:
    if data.MonthlyIncome[index].is_integer() == True:
        monNew.append(data.MonthlyIncome[index])
    else:
        monNew.append(testFull.MonthlyIncome[index])



In [77]:

    
testFull.MonthlyIncome.isnull().sum()









    Out[77]:





0



In [78]:

    
data.MonthlyIncome = monNew



In [79]:

    
"""FEATURE ENGINEERING"""









    Out[79]:





'FEATURE ENGINEERING'



In [80]:

    
data.MonthlyIncome.describe()









    Out[80]:





count    1.500000e+05
mean     6.546643e+03
std      1.290915e+04
min      0.000000e+00
25%      3.734000e+03
50%      5.524000e+03
75%      7.869294e+03
max      3.008750e+06
Name: MonthlyIncome, dtype: float64



In [81]:

    
"""No Income Variable Indicator"""









    Out[81]:





'No Income Variable Indicator'



In [82]:

    
for val in data.MonthlyIncome:
    if val <=0:
        data['NoIncome_MI_indicator']=1
    else:
        data['NoIncome_MI_indicator']=0



In [83]:

    
"""Zero Debt Ratio Indicator"""









    Out[83]:





'Zero Debt Ratio Indicator'



In [84]:

    
for val in data.DebtRatio:
    if val <=0:
        data['No_DebtRatio_indicator']=1
    else:
        data['No_DebtRatio_indicator']=0



In [85]:

    
"""Monthly Income is Zero, But Debt Ratio is non-zero = 1"""









    Out[85]:





'Monthly Income is Zero, But Debt Ratio is non-zero = 1'



In [86]:

    
data['MIz_DRnz']=map(lambda x,y: 1 if (x==0 and y>0) else 0, data['MonthlyIncome'], data['DebtRatio'])



In [87]:

    
"""Monthly Income is Zero, But Debt Ratio is zero = 1"""









    Out[87]:





'Monthly Income is Zero, But Debt Ratio is zero = 1'



In [88]:

    
data['MIz_DRz']=map(lambda x,y: 1 if (x==0 and y==0) else 0, data['MonthlyIncome'], data['DebtRatio'])



In [89]:

    
"""Monthly Income is Non-Zero, But Debt Ratio is zero = 1"""









    Out[89]:





'Monthly Income is Non-Zero, But Debt Ratio is zero = 1'



In [90]:

    
data['MInz_DRz']=map(lambda x,y: 1 if (x>0 and y==0) else 0, data['MonthlyIncome'], data['DebtRatio'])



In [91]:

    
"""Zero Revolving Utilization when Revolving Utilization Of Unsecured Lines == 0"""









    Out[91]:





'Zero Revolving Utilization when Revolving Utilization Of Unsecured Lines == 0'



In [92]:

    
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val <=0:
        data['ZeroRevolvingUtilization']=1
    else:
        data['ZeroRevolvingUtilization']=0



In [93]:

    
"""debtRatio * Monthly Income = DR_MI"""









    Out[93]:





'debtRatio * Monthly Income = DR_MI'



In [94]:

    
#All 0 MI to 1 so that DR dont become 0 
for val in data.MonthlyIncome:
    if val ==0:
        MIZ=1
    else:
        MIZ=val



In [95]:

    
data['DR_MI']=MIZ*data['DebtRatio']



In [96]:

    
data.DR_MI.describe()









    Out[96]:





count    150000.000000
mean       3673.196399
std        3493.021586
min           0.000000
25%           0.000000
50%        2989.970967
75%        8158.000000
max        8158.000000
Name: DR_MI, dtype: float64



In [97]:

    
from math import *



In [98]:

    
data['Log_DR_MI']=np.log(data.DR_MI)









    



C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: RuntimeWarning: divide by zero encountered in log
  if __name__ == '__main__':



In [99]:

    
"""Log of DebtRatio*MonthlyIncome"""









    Out[99]:





'Log of DebtRatio*MonthlyIncome'



In [101]:

    
data_new2=data



In [102]:

    
data['Log_DR_MI']=data.Log_DR_MI.replace([np.inf, -np.inf], 0)



In [103]:

    
"""  RevolvingLines = NumberOfOpenCreditLinesAndLoans - NumberRealEstateLoansOrLines
"""









    Out[103]:





'  RevolvingLines = NumberOfOpenCreditLinesAndLoans - NumberRealEstateLoansOrLines\n'



In [104]:

    
data['RevolvingLines']=data['NumberOfOpenCreditLinesAndLoans']-data['NumberRealEstateLoansOrLines']



In [105]:

    
"""HasRealEstateLoans = NumberRealEstateLoansOrLines > 0)"""









    Out[105]:





'HasRealEstateLoans = NumberRealEstateLoansOrLines > 0)'



In [106]:

    
HRSL=[]
for val in data.NumberRealEstateLoansOrLines:
    if val >0:
        HRSL.append(1)
    else:
        HRSL.append(0)
        
data['HasRealEstateLoans']=HRSL



In [107]:

    
"""HasMultipleRealEstateLoans = NumberRealEstateLoansOrLines > 2"""









    Out[107]:





'HasMultipleRealEstateLoans = NumberRealEstateLoansOrLines > 2'



In [108]:

    
MHRSL=[]
for val in data.NumberRealEstateLoansOrLines:
    if val >2:
        MHRSL.append(1)
    else:
        MHRSL.append(0)
        
data['HasMultipleRealEstateLoans ']=MHRSL



In [109]:

    
"""DisposableIncome = (1 - DebtRatio) * MonthlyIncome"""









    Out[109]:





'DisposableIncome = (1 - DebtRatio) * MonthlyIncome'



In [110]:

    
data['DisposableIncome']=(1-data['DebtRatio'])*data['MonthlyIncome']



In [111]:

    
"""RevolvingToRealEstate  = RevolvingLines / (1 + NumberRealEstateLoansOrLines)"""









    Out[111]:





'RevolvingToRealEstate  = RevolvingLines / (1 + NumberRealEstateLoansOrLines)'



In [112]:

    
data['RevolvingToRealEstate']=data['RevolvingLines'] / (1+data['NumberRealEstateLoansOrLines'])



In [113]:

    
"""FullUtilization = RevolvingUtilizationOfUnsecuredLines == 1)
  ExcessUtilization = RevolvingUtilizationOfUnsecuredLines > 1)"""









    Out[113]:





'FullUtilization = RevolvingUtilizationOfUnsecuredLines == 1)\n  ExcessUtilization = RevolvingUtilizationOfUnsecuredLines > 1)'



In [114]:

    
FU=[]
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val ==1:
        FU.append(1)
    else:
        FU.append(0)
        
data['FullUtilization']=FU



In [115]:

    
EU=[]
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val >1:
        EU.append(1)
    else:
        EU.append(0)
        
data['ExcessUtilization']=EU



In [116]:

    
"""
  RevolvingLinesPerPerson = RevolvingLines / (1 + NumberOfDependents)
  RealEstateLoansPerPerson = NumberRealEstateLoansOrLines / (1 + NumberOfDependents)
  IncomePerDependent = 1+NumberOfDependents/MonthlyIncome+1
  """









    Out[116]:





'\n  RevolvingLinesPerPerson = RevolvingLines / (1 + NumberOfDependents)\n  RealEstateLoansPerPerson = NumberRealEstateLoansOrLines / (1 + NumberOfDependents)\n  IncomePerDependent = 1+NumberOfDependents/MonthlyIncome+1\n  '



In [117]:

    
data['RevolvingLinesPerPerson'] = data['RevolvingLines'] / (1+data['NumberOfDependents'])
data['RealEstateLoanPerPerson'] = data['NumberRealEstateLoansOrLines'] / (1+data['NumberOfDependents'])



In [118]:

    
data['IncomePerDependent']=(1+data['NumberOfDependents']) / (1+data['MonthlyIncome'])



In [119]:

    
"""NumberOfTimePastDue"""









    Out[119]:





'NumberOfTimePastDue'



In [120]:

    
data['NumberOfTimePastDue']=data['NumberOfTime3059DaysPastDueNotWorse']+data['NumberOfTime6089DaysPastDueNotWorse']+data['NumberOfTimes90DaysLate']



In [121]:

    
"""DelinquenciesPerLine  = NumberOfTimesPastDue / NumberOfOpenCreditLinesAndLoans"""









    Out[121]:





'DelinquenciesPerLine  = NumberOfTimesPastDue / NumberOfOpenCreditLinesAndLoans'



In [122]:

    
data['DelinquenciesPerLine']=data['NumberOfTimePastDue'] /data['NumberOfOpenCreditLinesAndLoans']



In [123]:

    
data_new3=data



In [124]:

    
data['DelinquenciesPerLine']=data.DelinquenciesPerLine.replace([np.inf, -np.inf], np.NaN)
data.DelinquenciesPerLine[np.isnan(data.DelinquenciesPerLine)] = 0
data.DelinquenciesPerLine[np.isinf(data.DelinquenciesPerLine)] = 0
data['DelinquenciesPerLine']=data['DelinquenciesPerLine'].fillna(0)









    



C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [125]:

    
data['DelinquenciesPerLine']=data.DelinquenciesPerLine.replace([np.inf, -np.inf], 0)



In [126]:

    
data.DelinquenciesPerLine.isnull().sum()









    Out[126]:





0



In [127]:

    
"""DelinquenciesPerRevolvingLine  = NumberOfTimesPastDue / RevolvingLines"""









    Out[127]:





'DelinquenciesPerRevolvingLine  = NumberOfTimesPastDue / RevolvingLines'



In [128]:

    
data['DelinquenciesPerRevolvingLine'] = data['NumberOfTimePastDue'] / data['RevolvingLines']



In [131]:

    
data['DelinquenciesPerRevolvingLine']=data.DelinquenciesPerRevolvingLine.replace([np.inf, -np.inf], np.NaN)
data.DelinquenciesPerRevolvingLine[np.isnan(data.DelinquenciesPerRevolvingLine)] = 0
data.DelinquenciesPerRevolvingLine[np.isinf(data.DelinquenciesPerRevolvingLine)] = 0
data['DelinquenciesPerRevolvingLine']=data['DelinquenciesPerRevolvingLine'].fillna(0)









    



C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()



In [132]:

    
from sklearn.cross_validation import train_test_split









    



C:\ProgramData\Anaconda2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [133]:

    
data_new4=data



In [134]:

    
X = data.drop('SeriousDlqin2yrs', axis=1)
y = data.SeriousDlqin2yrs



In [135]:

    
#np.savetxt("C:/Users/saga54/Desktop/foo.csv", data, delimiter=",")



In [137]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)



In [144]:

    
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier



In [145]:

    
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)



In [146]:

    
y_pred = clf.fit(X_train, y_train).predict(X_test)



In [147]:

    
from sklearn.metrics import confusion_matrix



In [148]:

    
confusion_matrix(y_test, y_pred)









    Out[148]:





array([[33865,  1185],
       [ 1608,   842]])



In [149]:

    
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)









    Out[149]:





0.92552000000000001



In [150]:

    
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))









    



             precision    recall  f1-score   support

    class 0       0.95      0.97      0.96     35050
    class 1       0.42      0.34      0.38      2450

avg / total       0.92      0.93      0.92     37500



In [151]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)









    Out[151]:





0.65493231243995464



In [152]:

    
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)









    Out[152]:





0.65493231243995464



In [153]:

    
""""colsample_bytree": 0.41,
      "gamma": 0.643,
      "max_depth": 5,
      "max_delta_step": 1.78,
      "min_child_weight": 10.0,
      "objective": "binary:logistic",
      "subsample": 0.801,
      "learning_rate": 0.027,
      "silent": false,
      "nthread": 7,
      "n_estimators": 295,
      "seed": 2"""









    Out[153]:





'"colsample_bytree": 0.41,\n      "gamma": 0.643,\n      "max_depth": 5,\n      "max_delta_step": 1.78,\n      "min_child_weight": 10.0,\n      "objective": "binary:logistic",\n      "subsample": 0.801,\n      "learning_rate": 0.027,\n      "silent": false,\n      "nthread": 7,\n      "n_estimators": 295,\n      "seed": 2'



In [155]:

    
data.to_csv("C:/Users/saga54/Desktop/gmc.csv")



In [ ]:

    
"""RANDOM FOREST CLASSIFIER"""



In [200]:

    
from sklearn.ensemble import RandomForestClassifier
clfRF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)



In [201]:

    
y_pred = clfRF.fit(X_train, y_train).predict(X_test)



In [202]:

    
confusion_matrix(y_test, y_pred)









    Out[202]:





array([[34607,   443],
       [ 1985,   465]])



In [203]:

    
accuracy_score(y_test, y_pred)









    Out[203]:





0.93525333333333338



In [204]:

    
clfRF.feature_importances_









    Out[204]:





array([ 0.14185544,  0.10717513,  0.01953287,  0.00309399,  0.11670124,
        0.03154686,  0.048104  ,  0.00772096,  0.0234183 ,  0.02028825,
        0.        ,  0.        ,  0.        ,  0.000473  ,  0.00032052,
        0.00160409,  0.        ,  0.00288444,  0.00298045,  0.02959041,
        0.00145809,  0.00195608,  0.08125582,  0.03305014,  0.        ,
        0.        ,  0.04334402,  0.01604187,  0.11711582,  0.05117611,
        0.05440517,  0.04290692])



In [205]:

    
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))









    



             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.51      0.19      0.28      2450

avg / total       0.92      0.94      0.92     37500



In [206]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)









    Out[206]:





0.58857841567440106



In [207]:

    
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)









    Out[207]:





0.58857841567440106



In [ ]:

    
"""ADA BOOSTING CLASSIFIER"""



In [208]:

    
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clfADA = AdaBoostClassifier(n_estimators=100)



In [209]:

    
y_pred = clfADA.fit(X_train, y_train).predict(X_test)



In [210]:

    
confusion_matrix(y_test, y_pred)









    Out[210]:





array([[34626,   424],
       [ 1914,   536]])



In [211]:

    
accuracy_score(y_test, y_pred)









    Out[211]:





0.93765333333333334



In [212]:

    
clfADA.feature_importances_









    Out[212]:





array([ 0.22,  0.11,  0.01,  0.  ,  0.09,  0.04,  0.02,  0.05,  0.02,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.01,  0.  ,  0.  ,  0.  ,
        0.  ,  0.02,  0.  ,  0.01,  0.06,  0.03,  0.  ,  0.  ,  0.06,
        0.02,  0.07,  0.11,  0.04,  0.01])



In [214]:

    
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))









    



             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.56      0.22      0.31      2450

avg / total       0.92      0.94      0.92     37500



In [215]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)









    Out[215]:





0.60333925296224056



In [216]:

    
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)









    Out[216]:





0.60333925296224056



In [ ]:

    
"""ENSEMBLE 1"""



In [218]:

    
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier


clf1 = GradientBoostingClassifier(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')



In [219]:

    
for clf, label in zip([clf1, clf2, clf3, eclf], ['Gradient Boosting Classifier', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))









    



Accuracy: 0.94 (+/- 0.00) [Gradient Boosting Classifier]
Accuracy: 0.93 (+/- 0.00) [Random Forest]
Accuracy: 0.92 (+/- 0.00) [naive Bayes]
Accuracy: 0.93 (+/- 0.00) [Ensemble]



In [220]:

    
"""Ensemble 2"""









    Out[220]:





'Ensemble 2'



In [221]:

    
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=1)
clf2 = RandomForestClassifier(n_estimators=100, random_state=1)
clf3 = AdaBoostClassifier(n_estimators=100)
X = X_train
y = y_train
eclf1 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(X, y)

y_pred1=eclf1.predict(X_test)


eclf2 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft')
eclf2 = eclf2.fit(X, y)

y_pred2=eclf2.predict(X_test)

eclf3 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[2,1,1])
eclf3 = eclf3.fit(X, y)
y_pred3=eclf3.predict(X_test)



In [222]:

    
confusion_matrix(y_test, y_pred1)

accuracy_score(y_test, y_pred1)









    Out[222]:





0.93773333333333331



In [223]:

    
confusion_matrix(y_test, y_pred2)

accuracy_score(y_test, y_pred2)









    Out[223]:





0.92554666666666663



In [224]:

    
confusion_matrix(y_test, y_pred3)

accuracy_score(y_test, y_pred3)









    Out[224]:





0.92552000000000001



In [225]:

    
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred1, target_names=target_names))









    



             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.55      0.24      0.34      2450

avg / total       0.92      0.94      0.93     37500



In [226]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred1)









    Out[226]:





0.61363212902850151



In [227]:

    
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred1)
metrics.auc(fpr, tpr)









    Out[227]:





0.61363212902850151



In [228]:

    
"""Ensemble 3"""









    Out[228]:





'Ensemble 3'



In [229]:

    
from sklearn.model_selection import GridSearchCV
clf1 = GradientBoostingClassifier(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = AdaBoostClassifier()
eclf = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('ab', clf3)], voting='soft')

params = {'rf__n_estimators': [20, 200],}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X_train, y_train)



In [230]:

    
y_pred=grid.predict(X_test)



In [231]:

    
confusion_matrix(y_test, y_pred)









    Out[231]:





array([[34734,   316],
       [ 1985,   465]])



In [232]:

    
accuracy_score(y_test, y_pred)









    Out[232]:





0.93864000000000003



In [233]:

    
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))









    



             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.60      0.19      0.29      2450

avg / total       0.92      0.94      0.92     37500



In [234]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)









    Out[234]:





0.59039011324929402



In [235]:

    
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)









    Out[235]:





0.59039011324929402



In [ ]:

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfTime60-89DaysPastDueNotWorse	NumberOfDependents
count	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	1.202690e+05	150000.000000	150000.000000	150000.000000	150000.000000	146076.000000
mean	75000.500000	0.066840	6.048438	52.295207	0.421033	353.005076	6.670221e+03	8.452760	0.265973	1.018240	0.240387	0.757222
std	43301.414527	0.249746	249.755371	14.771866	4.192781	2037.818523	1.438467e+04	5.145951	4.169304	1.129771	4.155179	1.115086
min	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000
25%	37500.750000	0.000000	0.029867	41.000000	0.000000	0.175074	3.400000e+03	5.000000	0.000000	0.000000	0.000000	0.000000
50%	75000.500000	0.000000	0.154181	52.000000	0.000000	0.366508	5.400000e+03	8.000000	0.000000	1.000000	0.000000	0.000000
75%	112500.250000	0.000000	0.559046	63.000000	0.000000	0.868254	8.249000e+03	11.000000	0.000000	2.000000	0.000000	1.000000
max	150000.000000	1.000000	50708.000000	109.000000	98.000000	329664.000000	3.008750e+06	58.000000	98.000000	54.000000	98.000000	20.000000

	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime3059DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfDependents
0	1	0.766127	45	2	0.802982	9120.0	13	0	6	2.0
1	0	0.957151	40	0	0.121876	2600.0	4	0	0	1.0
2	0	0.658180	38	1	0.085113	3042.0	2	1	0	0.0
3	0	0.233810	30	0	0.036050	3300.0	5	0	0	0.0
4	0	0.907239	49	1	0.024926	63588.0	7	0	1	0.0