In [15]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
data = pd.read_csv("cs-training.csv")

In [17]:
data.describe()


Out[17]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 1.202690e+05 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000
mean 75000.500000 0.066840 6.048438 52.295207 0.421033 353.005076 6.670221e+03 8.452760 0.265973 1.018240 0.240387 0.757222
std 43301.414527 0.249746 249.755371 14.771866 4.192781 2037.818523 1.438467e+04 5.145951 4.169304 1.129771 4.155179 1.115086
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 37500.750000 0.000000 0.029867 41.000000 0.000000 0.175074 3.400000e+03 5.000000 0.000000 0.000000 0.000000 0.000000
50% 75000.500000 0.000000 0.154181 52.000000 0.000000 0.366508 5.400000e+03 8.000000 0.000000 1.000000 0.000000 0.000000
75% 112500.250000 0.000000 0.559046 63.000000 0.000000 0.868254 8.249000e+03 11.000000 0.000000 2.000000 0.000000 1.000000
max 150000.000000 1.000000 50708.000000 109.000000 98.000000 329664.000000 3.008750e+06 58.000000 98.000000 54.000000 98.000000 20.000000

In [18]:
data=data.drop('Unnamed: 0', axis = 1)

In [19]:
data.describe()


Out[19]:
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 1.202690e+05 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000
mean 0.066840 6.048438 52.295207 0.421033 353.005076 6.670221e+03 8.452760 0.265973 1.018240 0.240387 0.757222
std 0.249746 249.755371 14.771866 4.192781 2037.818523 1.438467e+04 5.145951 4.169304 1.129771 4.155179 1.115086
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.029867 41.000000 0.000000 0.175074 3.400000e+03 5.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.154181 52.000000 0.000000 0.366508 5.400000e+03 8.000000 0.000000 1.000000 0.000000 0.000000
75% 0.000000 0.559046 63.000000 0.000000 0.868254 8.249000e+03 11.000000 0.000000 2.000000 0.000000 1.000000
max 1.000000 50708.000000 109.000000 98.000000 329664.000000 3.008750e+06 58.000000 98.000000 54.000000 98.000000 20.000000

In [20]:
data.columns


Out[20]:
Index([u'SeriousDlqin2yrs', u'RevolvingUtilizationOfUnsecuredLines', u'age',
       u'NumberOfTime30-59DaysPastDueNotWorse', u'DebtRatio', u'MonthlyIncome',
       u'NumberOfOpenCreditLinesAndLoans', u'NumberOfTimes90DaysLate',
       u'NumberRealEstateLoansOrLines',
       u'NumberOfTime60-89DaysPastDueNotWorse', u'NumberOfDependents'],
      dtype='object')

In [21]:
cleanCol = []
for i in range(len(data.columns)):
    cleanCol.append(data.columns[i].replace('-', ''))

In [22]:
cleanCol


Out[22]:
['SeriousDlqin2yrs',
 'RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime3059DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime6089DaysPastDueNotWorse',
 'NumberOfDependents']

In [23]:
data.columns = cleanCol

In [24]:
data.describe()


Out[24]:
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime3059DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime6089DaysPastDueNotWorse NumberOfDependents
count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 1.202690e+05 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000
mean 0.066840 6.048438 52.295207 0.421033 353.005076 6.670221e+03 8.452760 0.265973 1.018240 0.240387 0.757222
std 0.249746 249.755371 14.771866 4.192781 2037.818523 1.438467e+04 5.145951 4.169304 1.129771 4.155179 1.115086
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.029867 41.000000 0.000000 0.175074 3.400000e+03 5.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.154181 52.000000 0.000000 0.366508 5.400000e+03 8.000000 0.000000 1.000000 0.000000 0.000000
75% 0.000000 0.559046 63.000000 0.000000 0.868254 8.249000e+03 11.000000 0.000000 2.000000 0.000000 1.000000
max 1.000000 50708.000000 109.000000 98.000000 329664.000000 3.008750e+06 58.000000 98.000000 54.000000 98.000000 20.000000

In [25]:
data.head(5)


Out[25]:
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime3059DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime6089DaysPastDueNotWorse NumberOfDependents
0 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0

In [26]:
data.isnull().sum()


Out[26]:
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime3059DaysPastDueNotWorse         0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime6089DaysPastDueNotWorse         0
NumberOfDependents                       3924
dtype: int64

In [27]:
data['age'].hist(bins=100)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6dfbcf3b10>
/opt/conda/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [28]:
data.age.describe()


Out[28]:
count    150000.000000
mean         52.295207
std          14.771866
min           0.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64

In [29]:
for i in range(0,110):
    print i, len(data[data.age == i])


0 1
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 183
22 434
23 641
24 816
25 953
26 1193
27 1338
28 1560
29 1702
30 1937
31 2038
32 2050
33 2239
34 2155
35 2246
36 2379
37 2521
38 2631
39 2987
40 3093
41 3122
42 3082
43 3208
44 3294
45 3502
46 3714
47 3719
48 3806
49 3837
50 3753
51 3627
52 3609
53 3648
54 3561
55 3416
56 3589
57 3375
58 3443
59 3280
60 3258
61 3522
62 3568
63 3719
64 3058
65 2594
66 2494
67 2503
68 2235
69 1954
70 1777
71 1646
72 1649
73 1520
74 1451
75 1241
76 1183
77 1099
78 1054
79 981
80 876
81 774
82 647
83 512
84 480
85 483
86 407
87 357
88 313
89 276
90 198
91 154
92 93
93 87
94 47
95 45
96 18
97 17
98 6
99 9
100 0
101 3
102 3
103 3
104 0
105 1
106 0
107 1
108 0
109 2

In [30]:
"""age should be between a defined range, 0-109 makes less sense, should be between 22-91"""


Out[30]:
'age should be between a defined range, 0-109 makes less sense, should be between 22-91'

In [31]:
np.median(data.age)
np.mean(data.age)


Out[31]:
52.295206666666665

In [20]:
mean_age=np.mean(data.age)
ageNew=[]
for val in data.age:
    if val < 22 or val >91:
        ageNew.append(mean_age)
    else:
        ageNew.append(val)

In [21]:
ageNew


Out[21]:
[45,
 40,
 38,
 30,
 49,
 74,
 57,
 39,
 27,
 57,
 30,
 51,
 46,
 40,
 76,
 64,
 78,
 53,
 43,
 25,
 43,
 38,
 39,
 32,
 58,
 50,
 58,
 69,
 24,
 58,
 28,
 24,
 62,
 57,
 42,
 64,
 50,
 75,
 26,
 52,
 41,
 81,
 31,
 28,
 68,
 70,
 62,
 31,
 38,
 70,
 73,
 31,
 62,
 51,
 29,
 55,
 46,
 64,
 43,
 35,
 58,
 69,
 72,
 46,
 60,
 67,
 55,
 27,
 31,
 64,
 36,
 67,
 56,
 31,
 49,
 36,
 29,
 37,
 69,
 41,
 52,
 66,
 53,
 56,
 31,
 58,
 58,
 64,
 45,
 60,
 51,
 83,
 52,
 34,
 44,
 37,
 48,
 57,
 32,
 75,
 61,
 34,
 52,
 44,
 57,
 46,
 80,
 32,
 62,
 61,
 47,
 58,
 52,
 68,
 39,
 74,
 46,
 44,
 68,
 68,
 31,
 34,
 67,
 58,
 39,
 58,
 61,
 59,
 36,
 59,
 43,
 77,
 48,
 81,
 51,
 53,
 56,
 63,
 55,
 54,
 55,
 72,
 53,
 27,
 31,
 38,
 40,
 48,
 50,
 34,
 48,
 57,
 30,
 28,
 36,
 60,
 58,
 49,
 40,
 63,
 50,
 38,
 47,
 63,
 42,
 54,
 63,
 29,
 36,
 33,
 72,
 29,
 53,
 79,
 58,
 69,
 69,
 75,
 41,
 35,
 46,
 65,
 55,
 55,
 51,
 25,
 44,
 86,
 56,
 55,
 64,
 53,
 49,
 55,
 80,
 38,
 44,
 40,
 35,
 69,
 64,
 61,
 46,
 49,
 64,
 40,
 27,
 60,
 52.295206666666665,
 50,
 64,
 66,
 53,
 62,
 63,
 50,
 61,
 42,
 63,
 73,
 72,
 52,
 45,
 44,
 63,
 37,
 38,
 36,
 41,
 63,
 76,
 43,
 64,
 67,
 52,
 66,
 55,
 42,
 28,
 47,
 61,
 73,
 58,
 68,
 41,
 44,
 42,
 40,
 78,
 81,
 49,
 58,
 54,
 33,
 53,
 23,
 49,
 44,
 63,
 42,
 65,
 54,
 28,
 78,
 66,
 32,
 43,
 59,
 76,
 68,
 61,
 68,
 57,
 68,
 57,
 46,
 69,
 44,
 28,
 53,
 40,
 36,
 42,
 64,
 57,
 62,
 47,
 62,
 49,
 66,
 54,
 56,
 83,
 45,
 29,
 67,
 29,
 46,
 25,
 68,
 33,
 55,
 46,
 66,
 66,
 48,
 52,
 32,
 51,
 27,
 32,
 30,
 37,
 49,
 37,
 36,
 62,
 57,
 59,
 38,
 52.295206666666665,
 58,
 37,
 63,
 32,
 52,
 43,
 29,
 45,
 38,
 62,
 29,
 41,
 29,
 47,
 55,
 87,
 41,
 61,
 28,
 43,
 71,
 46,
 30,
 43,
 59,
 47,
 61,
 72,
 50,
 68,
 58,
 69,
 71,
 48,
 37,
 50,
 38,
 35,
 59,
 67,
 30,
 56,
 51,
 40,
 28,
 54,
 46,
 57,
 35,
 79,
 46,
 79,
 64,
 74,
 64,
 63,
 63,
 54,
 63,
 45,
 22,
 66,
 57,
 43,
 61,
 54,
 90,
 60,
 34,
 38,
 60,
 76,
 76,
 39,
 83,
 63,
 52,
 58,
 42,
 63,
 58,
 62,
 46,
 27,
 27,
 35,
 55,
 73,
 51,
 36,
 56,
 59,
 49,
 64,
 56,
 37,
 48,
 36,
 70,
 49,
 57,
 39,
 61,
 38,
 24,
 33,
 73,
 79,
 81,
 55,
 57,
 58,
 52,
 51,
 63,
 52.295206666666665,
 56,
 41,
 43,
 56,
 59,
 40,
 74,
 49,
 68,
 57,
 48,
 40,
 23,
 52,
 48,
 55,
 61,
 33,
 69,
 39,
 84,
 32,
 78,
 61,
 37,
 33,
 81,
 60,
 46,
 57,
 57,
 32,
 42,
 38,
 82,
 33,
 68,
 52,
 46,
 66,
 57,
 50,
 34,
 53,
 76,
 64,
 72,
 48,
 71,
 59,
 53,
 34,
 51,
 55,
 46,
 27,
 43,
 75,
 31,
 43,
 51,
 28,
 51,
 55,
 69,
 42,
 33,
 60,
 54,
 48,
 49,
 73,
 91,
 66,
 49,
 46,
 37,
 54,
 22,
 31,
 39,
 56,
 44,
 50,
 55,
 52,
 58,
 62,
 42,
 47,
 42,
 39,
 45,
 82,
 64,
 55,
 58,
 35,
 62,
 52,
 40,
 57,
 45,
 27,
 24,
 37,
 36,
 40,
 86,
 56,
 59,
 40,
 49,
 66,
 63,
 45,
 30,
 36,
 39,
 27,
 44,
 34,
 59,
 38,
 38,
 39,
 43,
 57,
 75,
 29,
 55,
 60,
 37,
 69,
 48,
 89,
 56,
 75,
 55,
 63,
 49,
 31,
 61,
 46,
 40,
 91,
 46,
 56,
 29,
 45,
 49,
 64,
 51,
 45,
 78,
 36,
 54,
 67,
 63,
 61,
 86,
 50,
 51,
 70,
 44,
 39,
 49,
 58,
 39,
 33,
 53,
 36,
 62,
 62,
 29,
 44,
 32,
 52,
 69,
 71,
 63,
 44,
 54,
 41,
 32,
 87,
 43,
 47,
 75,
 64,
 41,
 30,
 40,
 56,
 55,
 36,
 57,
 56,
 49,
 53,
 51,
 64,
 28,
 54,
 53,
 37,
 70,
 48,
 52,
 36,
 51,
 69,
 52,
 67,
 36,
 49,
 51,
 36,
 33,
 62,
 69,
 70,
 49,
 46,
 65,
 56,
 51,
 57,
 48,
 47,
 33,
 75,
 40,
 40,
 68,
 35,
 65,
 81,
 52,
 69,
 34,
 40,
 35,
 80,
 68,
 47,
 64,
 36,
 49,
 55,
 35,
 63,
 64,
 64,
 26,
 77,
 32,
 54,
 40,
 53,
 58,
 50,
 56,
 75,
 36,
 33,
 52,
 62,
 60,
 52,
 36,
 71,
 83,
 76,
 40,
 46,
 78,
 36,
 76,
 41,
 60,
 63,
 52,
 58,
 56,
 33,
 64,
 33,
 40,
 56,
 60,
 27,
 41,
 48,
 69,
 39,
 36,
 34,
 50,
 79,
 54,
 36,
 39,
 35,
 38,
 52,
 24,
 76,
 41,
 74,
 47,
 57,
 63,
 62,
 39,
 51,
 60,
 68,
 63,
 55,
 58,
 55,
 33,
 58,
 55,
 44,
 57,
 65,
 56,
 25,
 89,
 32,
 48,
 38,
 58,
 59,
 32,
 50,
 26,
 55,
 82,
 62,
 46,
 66,
 46,
 59,
 29,
 39,
 36,
 71,
 82,
 26,
 58,
 68,
 42,
 58,
 26,
 33,
 51,
 60,
 29,
 79,
 27,
 50,
 31,
 45,
 26,
 63,
 40,
 57,
 54,
 33,
 54,
 24,
 41,
 47,
 76,
 87,
 34,
 46,
 82,
 48,
 83,
 86,
 56,
 90,
 73,
 33,
 85,
 45,
 56,
 31,
 30,
 32,
 40,
 79,
 37,
 63,
 28,
 69,
 47,
 71,
 56,
 40,
 64,
 59,
 28,
 45,
 31,
 58,
 44,
 37,
 68,
 47,
 57,
 51,
 50,
 74,
 73,
 81,
 58,
 87,
 55,
 50,
 34,
 67,
 63,
 62,
 59,
 32,
 64,
 72,
 64,
 30,
 79,
 81,
 41,
 58,
 55,
 45,
 44,
 49,
 74,
 26,
 59,
 39,
 48,
 49,
 47,
 28,
 42,
 53,
 68,
 31,
 34,
 56,
 30,
 52,
 54,
 39,
 57,
 60,
 45,
 33,
 64,
 85,
 34,
 37,
 63,
 35,
 32,
 73,
 39,
 38,
 40,
 42,
 36,
 64,
 51,
 25,
 36,
 34,
 73,
 55,
 39,
 83,
 63,
 52,
 59,
 24,
 44,
 36,
 74,
 45,
 41,
 63,
 41,
 45,
 54,
 76,
 65,
 38,
 47,
 68,
 34,
 46,
 55,
 34,
 29,
 35,
 43,
 37,
 74,
 55,
 72,
 80,
 50,
 77,
 73,
 52,
 48,
 44,
 50,
 44,
 70,
 53,
 70,
 36,
 63,
 50,
 48,
 62,
 53,
 43,
 58,
 67,
 59,
 55,
 82,
 47,
 43,
 58,
 41,
 63,
 50,
 32,
 37,
 36,
 45,
 26,
 80,
 40,
 35,
 52,
 38,
 78,
 57,
 57,
 32,
 43,
 60,
 54,
 59,
 ...]

In [22]:
data.age = ageNew

In [23]:
"""RevolvingUtilizationOfUnsecuredLines = Total balance on credit cards and personal lines of credit 
except real estate and no installment debt like car loans divided by the sum of credit limits"""


Out[23]:
'RevolvingUtilizationOfUnsecuredLines = Total balance on credit cards and personal lines of credit \nexcept real estate and no installment debt like car loans divided by the sum of credit limits'

In [24]:
data.RevolvingUtilizationOfUnsecuredLines.describe()


Out[24]:
count    150000.000000
mean          6.048438
std         249.755371
min           0.000000
25%           0.029867
50%           0.154181
75%           0.559046
max       50708.000000
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64

In [25]:
len(data[data.RevolvingUtilizationOfUnsecuredLines >1])


Out[25]:
3321

In [26]:
"""the value here should be between 0-1 [implies 0 to 100%], but few values are more than 1 [implying more than 100%], so all those values must be a data entry error and should be changed to the value/100"""


Out[26]:
'the value here should be between 0-1 [implies 0 to 100%], but few values are more than 1 [implying more than 100%], so all those values must be a data entry error and should be changed to the value/100'

In [27]:
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val >1:
        data['RUUL_indicator']=1
    else:
        data['RUUL_indicator']=0

In [28]:
RUULNew=[]

for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val <=10:
        RUULNew.append(val/10)
    elif val >10 and val <=100:
        RUULNew.append(val/100)
    elif val >100 and val <=1000:
        RUULNew.append(val/1000)
    elif val >1000 and val <=10000:
        RUULNew.append(val/10000)
    elif val >10000 and val <=100000:   
        RUULNew.append(val/100000)
    else:
        RUULNew.append(val)

In [29]:
data.RevolvingUtilizationOfUnsecuredLines = RUULNew

In [30]:
"""NumberOfTime3059DaysPastDueNotWorse"""


Out[30]:
'NumberOfTime3059DaysPastDueNotWorse'

In [31]:
data.NumberOfTime3059DaysPastDueNotWorse.describe()


Out[31]:
count    150000.000000
mean          0.421033
std           4.192781
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime3059DaysPastDueNotWorse, dtype: float64

In [32]:
for i in range(0,100):
    print i, len(data[data.NumberOfTime3059DaysPastDueNotWorse == i])


0 126018
1 16033
2 4598
3 1754
4 747
5 342
6 140
7 54
8 25
9 12
10 4
11 1
12 2
13 1
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 0
94 0
95 0
96 5
97 0
98 264
99 0

In [33]:
"""looks 96 and 98 are outliers"""


Out[33]:
'looks 96 and 98 are outliers'

In [34]:
New = []
meanNOTT = data.NumberOfTime3059DaysPastDueNotWorse.mean()
for val in data.NumberOfTime3059DaysPastDueNotWorse:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTime3059DaysPastDueNotWorse = New

In [36]:
"""DebtRatio"""


Out[36]:
'DebtRatio'

In [37]:
data.DebtRatio.describe()


Out[37]:
count    150000.000000
mean        353.005076
std        2037.818523
min           0.000000
25%           0.175074
50%           0.366508
75%           0.868254
max      329664.000000
Name: DebtRatio, dtype: float64

In [38]:
len(data[data.DebtRatio > 1])


Out[38]:
35137

In [39]:
len(data[data.DebtRatio >0])


Out[39]:
145887

In [40]:
New = []
medianNOTT = data.DebtRatio.median()
for val in data.NumberRealEstateLoansOrLines:
    if val>1:
        New.append(medianNOTT)
    else:
        New.append(val)

data.DebtRatio = New

In [ ]:
"""NumberOfOpenCreditLinesAndLoans"""

In [41]:
data.NumberOfOpenCreditLinesAndLoans.describe()


Out[41]:
count    150000.000000
mean          8.452760
std           5.145951
min           0.000000
25%           5.000000
50%           8.000000
75%          11.000000
max          58.000000
Name: NumberOfOpenCreditLinesAndLoans, dtype: float64

In [42]:
data['NumberOfOpenCreditLinesAndLoans'].hist(bins=100)


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0xbed5ba8>

In [ ]:
"""NumberOfTimes90DaysLate"""

In [43]:
data.NumberOfTimes90DaysLate.describe()


Out[43]:
count    150000.000000
mean          0.265973
std           4.169304
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTimes90DaysLate, dtype: float64

In [44]:
for i in range(0,100):
    print i, len(data[data.NumberOfTimes90DaysLate == i])


0 141662
1 5243
2 1555
3 667
4 291
5 131
6 80
7 38
8 21
9 19
10 8
11 5
12 2
13 4
14 2
15 2
16 0
17 1
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 0
94 0
95 0
96 5
97 0
98 264
99 0

In [45]:
New = []
meanNOTT = data.NumberOfTimes90DaysLate.mean()
for val in data.NumberOfTimes90DaysLate:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTimes90DaysLate = New

In [46]:
"""NumberRealEstateLoansOrLines"""


Out[46]:
'NumberRealEstateLoansOrLines'

In [47]:
data.NumberRealEstateLoansOrLines.describe()


Out[47]:
count    150000.000000
mean          1.018240
std           1.129771
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          54.000000
Name: NumberRealEstateLoansOrLines, dtype: float64

In [48]:
for i in range(0,55):
    print i, len(data[data.NumberRealEstateLoansOrLines == i])


0 56188
1 52338
2 31522
3 6300
4 2170
5 689
6 320
7 171
8 93
9 78
10 37
11 23
12 18
13 15
14 7
15 7
16 4
17 4
18 2
19 2
20 2
21 1
22 0
23 2
24 0
25 3
26 1
27 0
28 0
29 1
30 0
31 0
32 1
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 1

In [49]:
New = []
meanNOTT = data.NumberRealEstateLoansOrLines.mean()
for val in data.NumberRealEstateLoansOrLines:
    if val>50:
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberRealEstateLoansOrLines = New

In [50]:
data.NumberRealEstateLoansOrLines.describe()


Out[50]:
count    150000.000000
mean          1.017887
std           1.121458
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          32.000000
Name: NumberRealEstateLoansOrLines, dtype: float64

In [51]:
"""NumberOfTime6089DaysPastDueNotWorse"""


Out[51]:
'NumberOfTime6089DaysPastDueNotWorse'

In [52]:
data.NumberOfTime6089DaysPastDueNotWorse.describe()


Out[52]:
count    150000.000000
mean          0.240387
std           4.155179
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime6089DaysPastDueNotWorse, dtype: float64

In [53]:
New = []
meanNOTT = data.NumberOfTime6089DaysPastDueNotWorse.mean()
for val in data.NumberOfTime6089DaysPastDueNotWorse:
    if ((val == 98) | (val == 96)):
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfTime6089DaysPastDueNotWorse = New

In [54]:
data.NumberOfTime6089DaysPastDueNotWorse.describe()


Out[54]:
count    150000.000000
mean          0.065138
std           0.329861
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          11.000000
Name: NumberOfTime6089DaysPastDueNotWorse, dtype: float64

In [55]:
"""NumberOfDependents"""


Out[55]:
'NumberOfDependents'

In [56]:
data.NumberOfDependents.describe()


Out[56]:
count    146076.000000
mean          0.757222
std           1.115086
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          20.000000
Name: NumberOfDependents, dtype: float64

In [57]:
for i in range(0,25):
    print i, len(data[data.NumberOfDependents == i])


0 86902
1 26316
2 19522
3 9483
4 2862
5 746
6 158
7 51
8 24
9 5
10 5
11 0
12 0
13 1
14 0
15 0
16 0
17 0
18 0
19 0
20 1
21 0
22 0
23 0
24 0

In [58]:
"""having more than 10 dependents looks weird"""


Out[58]:
'having more than 10 dependents looks weird'

In [59]:
New = []
meanNOTT = data.NumberOfDependents.mean()
for val in data.NumberOfDependents:
    if val>10:
        New.append(meanNOTT)
    else:
        New.append(val)

data.NumberOfDependents = New

In [60]:
data.NumberOfDependents.isnull().sum()


Out[60]:
3924

In [61]:
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(0)

In [62]:
data.NumberOfDependents.describe()


Out[62]:
count    150000.000000
mean          0.737203
std           1.105450
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          10.000000
Name: NumberOfDependents, dtype: float64

In [63]:
"""MonthlyIncome"""


Out[63]:
'MonthlyIncome'

In [64]:
data.MonthlyIncome.describe()


Out[64]:
count    1.202690e+05
mean     6.670221e+03
std      1.438467e+04
min      0.000000e+00
25%      3.400000e+03
50%      5.400000e+03
75%      8.249000e+03
max      3.008750e+06
Name: MonthlyIncome, dtype: float64

In [65]:
train = data[data.MonthlyIncome.isnull() == False]
test = data[data.MonthlyIncome.isnull() == True]

In [66]:
train.shape, test.shape


Out[66]:
((120269, 12), (29731, 12))

In [67]:
X_train = train.drop(['MonthlyIncome', 'SeriousDlqin2yrs'], axis=1)
y_train = train.MonthlyIncome
X_test = test.drop(['MonthlyIncome', 'SeriousDlqin2yrs'], axis=1)

In [68]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [69]:
lmMod = LinearRegression(fit_intercept=True, normalize=True).fit(X_train, y_train)

In [70]:
lmMod.coef_


Out[70]:
array([-3948.4507996 ,    32.0281422 ,  -103.40827549,   -72.3226233 ,
         108.72297805,  -145.64343546,  1261.74822934,  -153.67162041,
         720.53837969,     0.        ])

In [71]:
pred = lmMod.predict(X_test)

In [72]:
predNoZero = []
for val in pred:
    if val >= 0:
        predNoZero.append(val)
    else:
        predNoZero.append(0.)

In [73]:
testFull = data[data.MonthlyIncome.isnull() == True]

In [74]:
testFull['MonthlyIncome'] = predNoZero


C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [76]:
monNew = []
for index in data.index:
    if data.MonthlyIncome[index].is_integer() == True:
        monNew.append(data.MonthlyIncome[index])
    else:
        monNew.append(testFull.MonthlyIncome[index])

In [77]:
testFull.MonthlyIncome.isnull().sum()


Out[77]:
0

In [78]:
data.MonthlyIncome = monNew

In [79]:
"""FEATURE ENGINEERING"""


Out[79]:
'FEATURE ENGINEERING'

In [80]:
data.MonthlyIncome.describe()


Out[80]:
count    1.500000e+05
mean     6.546643e+03
std      1.290915e+04
min      0.000000e+00
25%      3.734000e+03
50%      5.524000e+03
75%      7.869294e+03
max      3.008750e+06
Name: MonthlyIncome, dtype: float64

In [81]:
"""No Income Variable Indicator"""


Out[81]:
'No Income Variable Indicator'

In [82]:
for val in data.MonthlyIncome:
    if val <=0:
        data['NoIncome_MI_indicator']=1
    else:
        data['NoIncome_MI_indicator']=0

In [83]:
"""Zero Debt Ratio Indicator"""


Out[83]:
'Zero Debt Ratio Indicator'

In [84]:
for val in data.DebtRatio:
    if val <=0:
        data['No_DebtRatio_indicator']=1
    else:
        data['No_DebtRatio_indicator']=0

In [85]:
"""Monthly Income is Zero, But Debt Ratio is non-zero = 1"""


Out[85]:
'Monthly Income is Zero, But Debt Ratio is non-zero = 1'

In [86]:
data['MIz_DRnz']=map(lambda x,y: 1 if (x==0 and y>0) else 0, data['MonthlyIncome'], data['DebtRatio'])

In [87]:
"""Monthly Income is Zero, But Debt Ratio is zero = 1"""


Out[87]:
'Monthly Income is Zero, But Debt Ratio is zero = 1'

In [88]:
data['MIz_DRz']=map(lambda x,y: 1 if (x==0 and y==0) else 0, data['MonthlyIncome'], data['DebtRatio'])

In [89]:
"""Monthly Income is Non-Zero, But Debt Ratio is zero = 1"""


Out[89]:
'Monthly Income is Non-Zero, But Debt Ratio is zero = 1'

In [90]:
data['MInz_DRz']=map(lambda x,y: 1 if (x>0 and y==0) else 0, data['MonthlyIncome'], data['DebtRatio'])

In [91]:
"""Zero Revolving Utilization when Revolving Utilization Of Unsecured Lines == 0"""


Out[91]:
'Zero Revolving Utilization when Revolving Utilization Of Unsecured Lines == 0'

In [92]:
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val <=0:
        data['ZeroRevolvingUtilization']=1
    else:
        data['ZeroRevolvingUtilization']=0

In [93]:
"""debtRatio * Monthly Income = DR_MI"""


Out[93]:
'debtRatio * Monthly Income = DR_MI'

In [94]:
#All 0 MI to 1 so that DR dont become 0 
for val in data.MonthlyIncome:
    if val ==0:
        MIZ=1
    else:
        MIZ=val

In [95]:
data['DR_MI']=MIZ*data['DebtRatio']

In [96]:
data.DR_MI.describe()


Out[96]:
count    150000.000000
mean       3673.196399
std        3493.021586
min           0.000000
25%           0.000000
50%        2989.970967
75%        8158.000000
max        8158.000000
Name: DR_MI, dtype: float64

In [97]:
from math import *

In [98]:
data['Log_DR_MI']=np.log(data.DR_MI)


C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: RuntimeWarning: divide by zero encountered in log
  if __name__ == '__main__':

In [99]:
"""Log of DebtRatio*MonthlyIncome"""


Out[99]:
'Log of DebtRatio*MonthlyIncome'

In [101]:
data_new2=data

In [102]:
data['Log_DR_MI']=data.Log_DR_MI.replace([np.inf, -np.inf], 0)

In [103]:
"""  RevolvingLines = NumberOfOpenCreditLinesAndLoans - NumberRealEstateLoansOrLines
"""


Out[103]:
'  RevolvingLines = NumberOfOpenCreditLinesAndLoans - NumberRealEstateLoansOrLines\n'

In [104]:
data['RevolvingLines']=data['NumberOfOpenCreditLinesAndLoans']-data['NumberRealEstateLoansOrLines']

In [105]:
"""HasRealEstateLoans = NumberRealEstateLoansOrLines > 0)"""


Out[105]:
'HasRealEstateLoans = NumberRealEstateLoansOrLines > 0)'

In [106]:
HRSL=[]
for val in data.NumberRealEstateLoansOrLines:
    if val >0:
        HRSL.append(1)
    else:
        HRSL.append(0)
        
data['HasRealEstateLoans']=HRSL

In [107]:
"""HasMultipleRealEstateLoans = NumberRealEstateLoansOrLines > 2"""


Out[107]:
'HasMultipleRealEstateLoans = NumberRealEstateLoansOrLines > 2'

In [108]:
MHRSL=[]
for val in data.NumberRealEstateLoansOrLines:
    if val >2:
        MHRSL.append(1)
    else:
        MHRSL.append(0)
        
data['HasMultipleRealEstateLoans ']=MHRSL

In [109]:
"""DisposableIncome = (1 - DebtRatio) * MonthlyIncome"""


Out[109]:
'DisposableIncome = (1 - DebtRatio) * MonthlyIncome'

In [110]:
data['DisposableIncome']=(1-data['DebtRatio'])*data['MonthlyIncome']

In [111]:
"""RevolvingToRealEstate  = RevolvingLines / (1 + NumberRealEstateLoansOrLines)"""


Out[111]:
'RevolvingToRealEstate  = RevolvingLines / (1 + NumberRealEstateLoansOrLines)'

In [112]:
data['RevolvingToRealEstate']=data['RevolvingLines'] / (1+data['NumberRealEstateLoansOrLines'])

In [113]:
"""FullUtilization = RevolvingUtilizationOfUnsecuredLines == 1)
  ExcessUtilization = RevolvingUtilizationOfUnsecuredLines > 1)"""


Out[113]:
'FullUtilization = RevolvingUtilizationOfUnsecuredLines == 1)\n  ExcessUtilization = RevolvingUtilizationOfUnsecuredLines > 1)'

In [114]:
FU=[]
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val ==1:
        FU.append(1)
    else:
        FU.append(0)
        
data['FullUtilization']=FU

In [115]:
EU=[]
for val in data.RevolvingUtilizationOfUnsecuredLines:
    if val >1:
        EU.append(1)
    else:
        EU.append(0)
        
data['ExcessUtilization']=EU

In [116]:
"""
  RevolvingLinesPerPerson = RevolvingLines / (1 + NumberOfDependents)
  RealEstateLoansPerPerson = NumberRealEstateLoansOrLines / (1 + NumberOfDependents)
  IncomePerDependent = 1+NumberOfDependents/MonthlyIncome+1
  """


Out[116]:
'\n  RevolvingLinesPerPerson = RevolvingLines / (1 + NumberOfDependents)\n  RealEstateLoansPerPerson = NumberRealEstateLoansOrLines / (1 + NumberOfDependents)\n  IncomePerDependent = 1+NumberOfDependents/MonthlyIncome+1\n  '

In [117]:
data['RevolvingLinesPerPerson'] = data['RevolvingLines'] / (1+data['NumberOfDependents'])
data['RealEstateLoanPerPerson'] = data['NumberRealEstateLoansOrLines'] / (1+data['NumberOfDependents'])

In [118]:
data['IncomePerDependent']=(1+data['NumberOfDependents']) / (1+data['MonthlyIncome'])

In [119]:
"""NumberOfTimePastDue"""


Out[119]:
'NumberOfTimePastDue'

In [120]:
data['NumberOfTimePastDue']=data['NumberOfTime3059DaysPastDueNotWorse']+data['NumberOfTime6089DaysPastDueNotWorse']+data['NumberOfTimes90DaysLate']

In [121]:
"""DelinquenciesPerLine  = NumberOfTimesPastDue / NumberOfOpenCreditLinesAndLoans"""


Out[121]:
'DelinquenciesPerLine  = NumberOfTimesPastDue / NumberOfOpenCreditLinesAndLoans'

In [122]:
data['DelinquenciesPerLine']=data['NumberOfTimePastDue'] /data['NumberOfOpenCreditLinesAndLoans']

In [123]:
data_new3=data

In [124]:
data['DelinquenciesPerLine']=data.DelinquenciesPerLine.replace([np.inf, -np.inf], np.NaN)
data.DelinquenciesPerLine[np.isnan(data.DelinquenciesPerLine)] = 0
data.DelinquenciesPerLine[np.isinf(data.DelinquenciesPerLine)] = 0
data['DelinquenciesPerLine']=data['DelinquenciesPerLine'].fillna(0)


C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [125]:
data['DelinquenciesPerLine']=data.DelinquenciesPerLine.replace([np.inf, -np.inf], 0)

In [126]:
data.DelinquenciesPerLine.isnull().sum()


Out[126]:
0

In [127]:
"""DelinquenciesPerRevolvingLine  = NumberOfTimesPastDue / RevolvingLines"""


Out[127]:
'DelinquenciesPerRevolvingLine  = NumberOfTimesPastDue / RevolvingLines'

In [128]:
data['DelinquenciesPerRevolvingLine'] = data['NumberOfTimePastDue'] / data['RevolvingLines']

In [131]:
data['DelinquenciesPerRevolvingLine']=data.DelinquenciesPerRevolvingLine.replace([np.inf, -np.inf], np.NaN)
data.DelinquenciesPerRevolvingLine[np.isnan(data.DelinquenciesPerRevolvingLine)] = 0
data.DelinquenciesPerRevolvingLine[np.isinf(data.DelinquenciesPerRevolvingLine)] = 0
data['DelinquenciesPerRevolvingLine']=data['DelinquenciesPerRevolvingLine'].fillna(0)


C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
C:\ProgramData\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [132]:
from sklearn.cross_validation import train_test_split


C:\ProgramData\Anaconda2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [133]:
data_new4=data

In [134]:
X = data.drop('SeriousDlqin2yrs', axis=1)
y = data.SeriousDlqin2yrs

In [135]:
#np.savetxt("C:/Users/saga54/Desktop/foo.csv", data, delimiter=",")

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [144]:
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingClassifier

In [145]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)

In [146]:
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [147]:
from sklearn.metrics import confusion_matrix

In [148]:
confusion_matrix(y_test, y_pred)


Out[148]:
array([[33865,  1185],
       [ 1608,   842]])

In [149]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


Out[149]:
0.92552000000000001

In [150]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

    class 0       0.95      0.97      0.96     35050
    class 1       0.42      0.34      0.38      2450

avg / total       0.92      0.93      0.92     37500


In [151]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)


Out[151]:
0.65493231243995464

In [152]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)


Out[152]:
0.65493231243995464

In [153]:
""""colsample_bytree": 0.41,
      "gamma": 0.643,
      "max_depth": 5,
      "max_delta_step": 1.78,
      "min_child_weight": 10.0,
      "objective": "binary:logistic",
      "subsample": 0.801,
      "learning_rate": 0.027,
      "silent": false,
      "nthread": 7,
      "n_estimators": 295,
      "seed": 2"""


Out[153]:
'"colsample_bytree": 0.41,\n      "gamma": 0.643,\n      "max_depth": 5,\n      "max_delta_step": 1.78,\n      "min_child_weight": 10.0,\n      "objective": "binary:logistic",\n      "subsample": 0.801,\n      "learning_rate": 0.027,\n      "silent": false,\n      "nthread": 7,\n      "n_estimators": 295,\n      "seed": 2'

In [155]:
data.to_csv("C:/Users/saga54/Desktop/gmc.csv")

In [ ]:
"""RANDOM FOREST CLASSIFIER"""

In [200]:
from sklearn.ensemble import RandomForestClassifier
clfRF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

In [201]:
y_pred = clfRF.fit(X_train, y_train).predict(X_test)

In [202]:
confusion_matrix(y_test, y_pred)


Out[202]:
array([[34607,   443],
       [ 1985,   465]])

In [203]:
accuracy_score(y_test, y_pred)


Out[203]:
0.93525333333333338

In [204]:
clfRF.feature_importances_


Out[204]:
array([ 0.14185544,  0.10717513,  0.01953287,  0.00309399,  0.11670124,
        0.03154686,  0.048104  ,  0.00772096,  0.0234183 ,  0.02028825,
        0.        ,  0.        ,  0.        ,  0.000473  ,  0.00032052,
        0.00160409,  0.        ,  0.00288444,  0.00298045,  0.02959041,
        0.00145809,  0.00195608,  0.08125582,  0.03305014,  0.        ,
        0.        ,  0.04334402,  0.01604187,  0.11711582,  0.05117611,
        0.05440517,  0.04290692])

In [205]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.51      0.19      0.28      2450

avg / total       0.92      0.94      0.92     37500


In [206]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)


Out[206]:
0.58857841567440106

In [207]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)


Out[207]:
0.58857841567440106

In [ ]:
"""ADA BOOSTING CLASSIFIER"""

In [208]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clfADA = AdaBoostClassifier(n_estimators=100)

In [209]:
y_pred = clfADA.fit(X_train, y_train).predict(X_test)

In [210]:
confusion_matrix(y_test, y_pred)


Out[210]:
array([[34626,   424],
       [ 1914,   536]])

In [211]:
accuracy_score(y_test, y_pred)


Out[211]:
0.93765333333333334

In [212]:
clfADA.feature_importances_


Out[212]:
array([ 0.22,  0.11,  0.01,  0.  ,  0.09,  0.04,  0.02,  0.05,  0.02,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.01,  0.  ,  0.  ,  0.  ,
        0.  ,  0.02,  0.  ,  0.01,  0.06,  0.03,  0.  ,  0.  ,  0.06,
        0.02,  0.07,  0.11,  0.04,  0.01])

In [214]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.56      0.22      0.31      2450

avg / total       0.92      0.94      0.92     37500


In [215]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)


Out[215]:
0.60333925296224056

In [216]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)


Out[216]:
0.60333925296224056

In [ ]:
"""ENSEMBLE 1"""

In [218]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier


clf1 = GradientBoostingClassifier(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

In [219]:
for clf, label in zip([clf1, clf2, clf3, eclf], ['Gradient Boosting Classifier', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


Accuracy: 0.94 (+/- 0.00) [Gradient Boosting Classifier]
Accuracy: 0.93 (+/- 0.00) [Random Forest]
Accuracy: 0.92 (+/- 0.00) [naive Bayes]
Accuracy: 0.93 (+/- 0.00) [Ensemble]

In [220]:
"""Ensemble 2"""


Out[220]:
'Ensemble 2'

In [221]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=1)
clf2 = RandomForestClassifier(n_estimators=100, random_state=1)
clf3 = AdaBoostClassifier(n_estimators=100)
X = X_train
y = y_train
eclf1 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(X, y)

y_pred1=eclf1.predict(X_test)


eclf2 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft')
eclf2 = eclf2.fit(X, y)

y_pred2=eclf2.predict(X_test)

eclf3 = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[2,1,1])
eclf3 = eclf3.fit(X, y)
y_pred3=eclf3.predict(X_test)

In [222]:
confusion_matrix(y_test, y_pred1)

accuracy_score(y_test, y_pred1)


Out[222]:
0.93773333333333331

In [223]:
confusion_matrix(y_test, y_pred2)

accuracy_score(y_test, y_pred2)


Out[223]:
0.92554666666666663

In [224]:
confusion_matrix(y_test, y_pred3)

accuracy_score(y_test, y_pred3)


Out[224]:
0.92552000000000001

In [225]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred1, target_names=target_names))


             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.55      0.24      0.34      2450

avg / total       0.92      0.94      0.93     37500


In [226]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred1)


Out[226]:
0.61363212902850151

In [227]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred1)
metrics.auc(fpr, tpr)


Out[227]:
0.61363212902850151

In [228]:
"""Ensemble 3"""


Out[228]:
'Ensemble 3'

In [229]:
from sklearn.model_selection import GridSearchCV
clf1 = GradientBoostingClassifier(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = AdaBoostClassifier()
eclf = VotingClassifier(estimators=[('gb', clf1), ('rf', clf2), ('ab', clf3)], voting='soft')

params = {'rf__n_estimators': [20, 200],}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X_train, y_train)

In [230]:
y_pred=grid.predict(X_test)

In [231]:
confusion_matrix(y_test, y_pred)


Out[231]:
array([[34734,   316],
       [ 1985,   465]])

In [232]:
accuracy_score(y_test, y_pred)


Out[232]:
0.93864000000000003

In [233]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

    class 0       0.95      0.99      0.97     35050
    class 1       0.60      0.19      0.29      2450

avg / total       0.92      0.94      0.92     37500


In [234]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)


Out[234]:
0.59039011324929402

In [235]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)


Out[235]:
0.59039011324929402

In [ ]: