In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from sklearn import mixture
from sklearn.model_selection import train_test_split
%matplotlib inline
In [35]:
!head ../data/loan_dataset.csv
In [36]:
df = pd.read_csv("../data/loan_dataset.csv", na_values = ['?'])
df.info()
df.head(10)
Out[36]:
In [37]:
s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))
print("apprive rate:%.3f%%" %(approve_rate[1] * 100))
In [38]:
df.drop(df.columns[[0]], axis = 1, inplace = True)
df.info()
df.head(10)
Out[38]:
In [39]:
df.describe(include = 'all')
Out[39]:
In [40]:
def count_missing(x):
    # fraction of missing values in a column
    return sum(x.isnull()) / float(len(x))
print("Missing Value Statistics")
print(df.apply(count_missing, axis = 0))
In [41]:
df.fillna(value = {'Native Country' : 'Others'}, inplace = True)
df.fillna(value = {'Occupation' : 'Others'}, inplace = True)
df.fillna(value = {'Work Class' : 'Others'}, inplace = True)
df.head()
Out[41]:
In [42]:
df.dropna(inplace = True)
In [43]:
print(df.apply(count_missing, axis = 0))
In [44]:
print(len(df['Native Country'].unique()))
country_cnt = df['Native Country'].value_counts(sort = True)
# fold countries with fewer than 30 rows into 'Others'
for key, val in country_cnt.items():
    if val < 30:
        df['Native Country'].replace(key, 'Others', inplace = True)
country_cnt = df['Native Country'].value_counts(sort = True)
country_cnt
Out[44]:
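Aside: the per-key replace loop above works; a minimal vectorized sketch of the same rare-category folding is given below. The threshold of 30 is taken from the loop; `counts` and `rare_countries` are illustrative names, not from the original notebook.

# Sketch (equivalent to the loop above, assuming the same < 30 threshold)
counts = df['Native Country'].value_counts()
rare_countries = counts[counts < 30].index
df['Native Country'] = df['Native Country'].mask(df['Native Country'].isin(rare_countries), 'Others')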
In [45]:
print(len(df['Occupation'].unique()))
occupation_cnt = df['Occupation'].value_counts(sort = True)
# fold occupations with fewer than 30 rows into 'Others'
for key, val in occupation_cnt.items():
    if val < 30:
        df['Occupation'].replace(key, 'Others', inplace = True)
occupation_cnt = df['Occupation'].value_counts(sort = True)
occupation_cnt
Out[45]:
In [46]:
print(len(df['Work Class'].unique()))
work_cnt = df['Work Class'].value_counts(sort = True)
# fold work classes with fewer than 30 rows into 'Others'
for key, val in work_cnt.items():
    if val < 30:
        df['Work Class'].replace(key, 'Others', inplace = True)
work_cnt = df['Work Class'].value_counts(sort = True)
work_cnt
Out[46]:
In [47]:
df.describe(include = 'all', percentiles = [0.25, 0.5, 0.75, 0.997])
Out[47]:
In [48]:
age_list = df.Age.tolist()
cats = pd.qcut(age_list, 5)
cats
Out[48]:
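The quintile edges reported by qcut are what the hand-written age_bin function in the next cell encodes. A sketch of an equivalent, letting pandas assign the bin labels directly (age_edges is an illustrative name; the edge values are copied from age_bin below):

# Sketch (assumes the same quintile edges as age_bin below)
age_edges = [-np.inf, 26, 33, 41, 60, np.inf]
df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=list(range(5))).astype('category')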
In [49]:
# bin ages using the quintile edges found above
def age_bin(x):
    if x <= 26:
        return 0
    elif x <= 33:
        return 1
    elif x <= 41:
        return 2
    elif x <= 60:
        return 3
    else:
        return 4
df['Age'] = df['Age'].apply(age_bin)
df['Age'] = df['Age'].astype('category')
df.head()
Out[49]:
In [50]:
df = df[df['Education Num'] <= 16]
df['Education Num'].hist(bins = 10)
Out[50]:
In [51]:
edu_list = df['Education Num'].tolist()
edu_cats = pd.qcut(edu_list, 4)
edu_cats
Out[51]:
In [52]:
# bin education years using the quartile edges found above
def edu_bin(x):
    if x <= 9:
        return 0
    elif x <= 10:
        return 1
    elif x <= 12:
        return 2
    else:
        return 3
df['Education Num'] = df['Education Num'].apply(edu_bin)
df['Education Num'] = df['Education Num'].astype('category')
df.head()
Out[52]:
In [53]:
df.drop(['Capital Gain', 'Capital Loss'], axis = 1, inplace = True)
df.head()
Out[53]:
In [54]:
from scipy.stats import norm, entropy
s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))
hy = entropy(approve_rate)
print(hy)
In [55]:
def HY_X(py_x, px):
    # conditional entropy H(Y|X) = sum_x p(x) * H(Y | X = x) for a binary target;
    # py_x and px must be aligned (same category order)
    ret = 0.0
    for p_x, p_y_x in zip(px, py_x):
        ret += p_x * entropy([p_y_x, 1 - p_y_x])
    return ret
def NE(hx, hy, hy_x):
    # normalized information gain: (H(Y) - H(Y|X)) / (H(X) + H(Y))
    return (hy - hy_x) / (hx + hy)
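A quick sanity check of these definitions on made-up numbers: both toy features split the data 50/50, but only the second one shifts the approval rate, so only the second gets a positive normalized score.

# Toy check (illustrative numbers only)
px_toy = [0.5, 0.5]
hy_toy = entropy([0.5, 0.5])
print(NE(entropy(px_toy), hy_toy, HY_X([0.5, 0.5], px_toy)))  # ~0: uninformative feature
print(NE(entropy(px_toy), hy_toy, HY_X([0.9, 0.1], px_toy)))  # > 0: informative feature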
In [56]:
cols = list(df.columns.values)
cols.remove("APPROVE/NOT")
cols.remove("FnlWgt")
cols.remove("hours per wk")
print(cols)
for column_name in cols:
    approved = df['APPROVE/NOT'].groupby(df[column_name]).sum()
    total = df['APPROVE/NOT'].groupby(df[column_name]).count()
    px = df[column_name].value_counts() / float(df[column_name].count())
    # align the per-category approval rate with px's category order before computing H(Y|X)
    py_x = approved.div(total).reindex(px.index)
    hx = entropy(px)
    hy_x = HY_X(py_x, px)
    ne = NE(hx, hy, hy_x)
    print("%-15s %.5f %.5f %.5f" % (column_name, hx, hy_x, ne))
In [57]:
df.drop(['Work Class'], axis = 1, inplace = True)
In [58]:
df1 = df
In [59]:
cols = ['Education', 'Maried Status', 'Occupation', 'Race', 'Relationship', 'Gender', 'Native Country']
for col in cols:
    # map each level to an integer code, numbered in order of first appearance
    keys = df1[col].unique()
    mapping = dict(zip(keys, range(1, len(keys) + 1)))
    for key, value in mapping.items():
        df1[col].replace(key, value, inplace = True)
    df1[col] = df1[col].astype('category')
df1['APPROVE/NOT'] = df1['APPROVE/NOT'].astype('category')
df1.describe(include = 'all')
Out[59]:
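An alternative sketch of the same integer encoding using pandas category codes. Note this is not byte-for-byte equivalent: cat.codes numbers levels by sorted order rather than by first appearance, so the exact integer assigned to each level may differ from the loop above.

# Sketch: integer encoding via category codes (+1 to keep codes 1-based like above)
for col in cols:
    df1[col] = (df1[col].astype('category').cat.codes + 1).astype('category')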
In [60]:
df1.to_csv('full_data1.csv', index = False, header = False,
columns = ['Age', 'Education', 'Education Num', 'Maried Status',
'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country',
'FnlWgt', 'hours per wk', 'APPROVE/NOT'])
In [61]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from matplotlib import pylab
from collections import defaultdict
label_list = ['not approve', 'approve']
def train_model(clf_factory, X, Y, cv, name, isplot = False):
    labels = np.unique(Y).astype('int')
    print(labels)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []  # just to later get the median
    cms = []
    weights = [0] * X.shape[1]
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        weights = [a + b for (a, b) in zip(weights, clf.coef_[0])]
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        for label in labels:
            # one-vs-rest PR and ROC curves per class
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    avg_weights = [weight / float(cv.get_n_splits()) for weight in weights]
    print(avg_weights)
    if isplot:
        for label in labels:
            print("Plotting", label_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, label_list[label])
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % label_list[label])
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def plot_confusion_matrix(cm, label_list, name, title):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.matshow(cm, fignum = False, cmap = 'Blues', vmin = 0, vmax = 1.0)
    ax = pylab.axes()
    ax.set_xticks(range(len(label_list)))
    ax.set_xticklabels(label_list)
    ax.xaxis.set_ticks_position("bottom")
    ax.set_yticks(range(len(label_list)))
    ax.set_yticklabels(label_list)
    pylab.title(title)
    pylab.colorbar()
    pylab.grid(False)
    pylab.xlabel('Predicted Class')
    pylab.ylabel('True Class')
    pylab.show()
def plot_roc(auc_score, name, tpr, fpr, label=None):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.grid(True)
    pylab.plot([0, 1], [0, 1], 'k--')
    pylab.plot(fpr, tpr, label=label)
    pylab.fill_between(fpr, tpr, alpha=0.5)
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('False Positive Rate')
    pylab.ylabel('True Positive Rate')
    pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
    pylab.legend(loc="lower right")
In [62]:
f = open("full_data1.csv")
data = np.loadtxt(f, delimiter = ',')
# data
X1 = data[:, 0:9]
X2 = data[:, 9:11]
Y = data[:, -1]
# one-hot encoder
enc = preprocessing.OneHotEncoder()
enc.fit(X1)
TX1 = enc.transform(X1).toarray()
print(TX1.shape[1])
# normalization
# zscore_scaler = preprocessing.StandardScaler()
# TX2 = zscore_scaler.fit_transform(X2)
normalizer = preprocessing.Normalizer().fit(X2)
TX2 = normalizer.transform(X2)
print(TX2.shape[1])
# combine together
X = np.concatenate((TX1, TX2), axis = 1)
# cross validation
title = "Learning Curves (Logistic Regression)"
cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)
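One thing worth flagging: preprocessing.Normalizer rescales each row of X2 to unit norm, mixing FnlWgt and hours-per-week within a sample, whereas the commented-out StandardScaler lines above standardize each column independently. If per-feature scaling is what is intended, the column-wise variant is simply:

# Sketch: column-wise scaling instead of the row-wise Normalizer above
zscore_scaler = preprocessing.StandardScaler()
TX2 = zscore_scaler.fit_transform(X2)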
In [63]:
def create_model():
    # LogisticRegression is imported above; liblinear supports the L1 penalty
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    return clf
train_avg, test_avg, cms = train_model(create_model, X, Y, cv, "Log Reg", isplot = True)
cm_avg = np.mean(cms, axis=0)
cm_norm = cm_avg / np.sum(cm_avg, axis=0)
print(cm_norm)
plot_confusion_matrix(cm_norm, label_list, "lr", "Confusion Matrix")
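A note on the normalization above: dividing by np.sum(cm_avg, axis=0) makes each column (predicted class) sum to 1, so the diagonal reads like precision. If per-true-class rates (recall) are wanted instead, a row-wise sketch:

# Sketch: row-wise normalization so each true class sums to 1
cm_row_norm = cm_avg / cm_avg.sum(axis=1, keepdims=True)
print(cm_row_norm)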
In [64]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv = cv, n_jobs = n_jobs,
        train_sizes = train_sizes, scoring = 'roc_auc')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [65]:
estimator = LogisticRegression()
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)
estimator = LogisticRegression(C = 10)
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)
Out[65]: