notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# This enables inline Plots
%matplotlib inline

pd.set_option('display.max_rows', 10)



In [2]:

    
credit = pd.read_csv('../data/credit-screening/crx_data.csv', header = None, 
                     names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16'])



In [3]:

    
print credit









    



    A1     A2      A3 A4 A5  A6  A7    A8 A9 A10  A11 A12 A13    A14  A15 A16
0    b  30.83   0.000  u  g   w   v  1.25  t   t    1   f   g  00202    0   +
1    a  58.67   4.460  u  g   q   h  3.04  t   t    6   f   g  00043  560   +
2    a  24.50   0.500  u  g   q   h  1.50  t   f    0   f   g  00280  824   +
3    b  27.83   1.540  u  g   w   v  3.75  t   t    5   t   g  00100    3   +
4    b  20.17   5.625  u  g   w   v  1.71  t   f    0   f   s  00120    0   +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  ...  ..  ..    ...  ...  ..
685  b  21.08  10.085  y  p   e   h  1.25  f   f    0   f   g  00260    0   -
686  a  22.67   0.750  u  g   c   v  2.00  f   t    2   t   g  00200  394   -
687  a  25.25  13.500  y  p  ff  ff  2.00  f   t    1   t   g  00200    1   -
688  b  17.92   0.205  u  g  aa   v  0.04  f   f    0   f   g  00280  750   -
689  b  35.00   3.375  u  g   c   h  8.29  f   f    0   t   g  00000    0   -

[690 rows x 16 columns]



In [4]:

    
credit.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB



In [5]:

    
credit.columns









    Out[5]:





Index([u'A1', u'A2', u'A3', u'A4', u'A5', u'A6', u'A7', u'A8', u'A9', u'A10', u'A11', u'A12', u'A13', u'A14', u'A15', u'A16'], dtype='object')



In [6]:

    
credit.head()



In [7]:

    
credit.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB



In [8]:

    
def recodeA16(sign):
        if sign == '+':
            return 1
        return 0
credit['A16R'] = credit['A16'].map(recodeA16).astype(float)
credit = credit.drop('A16', axis = 1)



In [9]:

    
#credit['A1': 'A3', 'A4': 'A8', 'A9': 'A11', 'A12':'A15'] = credit['A1': 'A3', 'A4': 'A8', 'A9': 'A11', 'A12':'A15'].astype(float)



In [10]:

    
credit.replace('?', np.NaN, inplace = True)



In [11]:

    
credit.A1.unique()









    Out[11]:





array(['b', 'a', nan], dtype=object)



In [12]:

    
credit.drop(['A4', 'A5', 'A6', 'A7'], axis = 1)









    Out[12]:






  
    
      
      A1
      A2
      A3
      A8
      A9
      A10
      A11
      A12
      A13
      A14
      A15
      A16R
    
  
  
    
      0  
       b
       30.83
        0.000
       1.25
       t
       t
       1
       f
       g
       00202
         0
       1
    
    
      1  
       a
       58.67
        4.460
       3.04
       t
       t
       6
       f
       g
       00043
       560
       1
    
    
      2  
       a
       24.50
        0.500
       1.50
       t
       f
       0
       f
       g
       00280
       824
       1
    
    
      3  
       b
       27.83
        1.540
       3.75
       t
       t
       5
       t
       g
       00100
         3
       1
    
    
      4  
       b
       20.17
        5.625
       1.71
       t
       f
       0
       f
       s
       00120
         0
       1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      685
       b
       21.08
       10.085
       1.25
       f
       f
       0
       f
       g
       00260
         0
       0
    
    
      686
       a
       22.67
        0.750
       2.00
       f
       t
       2
       t
       g
       00200
       394
       0
    
    
      687
       a
       25.25
       13.500
       2.00
       f
       t
       1
       t
       g
       00200
         1
       0
    
    
      688
       b
       17.92
        0.205
       0.04
       f
       f
       0
       f
       g
       00280
       750
       0
    
    
      689
       b
       35.00
        3.375
       8.29
       f
       f
       0
       t
       g
       00000
         0
       0
    
  

690 rows × 12 columns



In [13]:

    
credit.A1.unique()









    Out[13]:





array(['b', 'a', nan], dtype=object)



In [14]:

    
def impute_a1(val):
    return np.random.choice(['a', 'b'], p=[0.7, 0.3])



In [15]:

    
credit.A1.value_counts()/credit.A1.size









    Out[15]:





b    0.678261
a    0.304348
dtype: float64



In [16]:

    
def impute_a1(val):
    return np.random.choice(['a', 'b'], p = [0.7, 0.3])



In [17]:

    
credit.A1 = credit.A1.map(impute_a1)



In [18]:

    
credit.A1.unique()









    Out[18]:





array(['b', 'a'], dtype=object)



In [19]:

    
credit.A5.unique()









    Out[19]:





array(['g', 'p', nan, 'gg'], dtype=object)



In [20]:

    
col_dist = {}
def get_col_dist(col_name):
    excl_null_mask = credit[col_name] != '?'
    row_count = credit[excl_null_mask][col_name].size
    col_data = {}
    col_data['prob'] = (credit[excl_null_mask][col_name].value_counts() / row_count).values
    col_data['values'] = (credit[excl_null_mask][col_name].value_counts() / row_count).index.values
    return col_data



In [21]:

    
col_dist['A5'] = get_col_dist('A5')
col_dist['A6'] = get_col_dist('A6')
col_dist['A7'] = get_col_dist('A7')



In [22]:

    
def impute_cols(val, options):
    if val == '?':
        return np.random.choice(options['values'], p=options['prob'])
    return val



In [23]:

    
def impute_a5(val):
    return impute_cols(val, col_dist['A5'])

def impute_a6(val):
    return impute_cols(val, col_dist['A6'])

def impute_a7(val):
    return impute_cols(val, col_dist['A7'])



In [24]:

    
credit.A5 = credit.A5.map(impute_a5)
credit.A6 = credit.A6.map(impute_a6)
credit.A7 = credit.A7.map(impute_a7)
# crx_data.A14 = crx_data.A14.map(impute_a14)



In [25]:

    
credit.A6.unique()
credit.A7.unique()









    Out[25]:





array(['v', 'h', 'bb', 'ff', 'j', 'z', nan, 'o', 'dd', 'n'], dtype=object)



In [26]:

    
credit.A2.unique()









    Out[26]:





array(['30.83', '58.67', '24.50', '27.83', '20.17', '32.08', '33.17',
       '22.92', '54.42', '42.50', '22.08', '29.92', '38.25', '48.08',
       '45.83', '36.67', '28.25', '23.25', '21.83', '19.17', '25.00',
       '47.75', '27.42', '41.17', '15.83', '47.00', '56.58', '57.42',
       '42.08', '29.25', '42.00', '49.50', '36.75', '22.58', '27.25',
       '23.00', '27.75', '54.58', '34.17', '28.92', '29.67', '39.58',
       '56.42', '54.33', '41.00', '31.92', '41.50', '23.92', '25.75',
       '26.00', '37.42', '34.92', '34.25', '23.33', '23.17', '44.33',
       '35.17', '43.25', '56.75', '31.67', '23.42', '20.42', '26.67',
       '36.00', '25.50', '19.42', '32.33', '34.83', '38.58', '44.25',
       '44.83', '20.67', '34.08', '21.67', '21.50', '49.58', '27.67',
       '39.83', nan, '37.17', '25.67', '34.00', '49.00', '62.50', '31.42',
       '52.33', '28.75', '28.58', '22.50', '28.50', '37.50', '35.25',
       '18.67', '54.83', '40.92', '19.75', '29.17', '24.58', '33.75',
       '25.42', '37.75', '52.50', '57.83', '20.75', '39.92', '24.75',
       '44.17', '23.50', '47.67', '22.75', '34.42', '28.42', '67.75',
       '47.42', '36.25', '32.67', '48.58', '33.58', '18.83', '26.92',
       '31.25', '56.50', '43.00', '22.33', '32.83', '40.33', '30.50',
       '52.83', '46.67', '58.33', '37.33', '23.08', '32.75', '68.67',
       '28.00', '44.00', '25.08', '32.00', '60.58', '40.83', '19.33',
       '41.33', '56.00', '49.83', '22.67', '27.00', '26.08', '18.42',
       '21.25', '57.08', '22.42', '48.75', '40.00', '40.58', '28.67',
       '33.08', '21.33', '41.75', '34.50', '48.17', '27.58', '24.08',
       '24.83', '36.33', '35.42', '71.58', '39.50', '39.33', '24.33',
       '60.08', '55.92', '53.92', '18.92', '50.08', '65.42', '17.58',
       '18.08', '19.67', '25.17', '33.50', '58.42', '26.17', '42.83',
       '38.17', '20.50', '48.25', '28.33', '18.75', '18.50', '45.00',
       '40.25', '41.42', '17.83', '18.17', '20.00', '52.17', '50.75',
       '17.08', '18.33', '59.67', '18.00', '37.58', '30.67', '18.58',
       '16.25', '21.17', '17.67', '16.50', '29.50', '21.75', '18.25',
       '35.75', '16.08', '69.17', '32.92', '16.33', '22.17', '57.58',
       '15.92', '31.75', '19.00', '17.50', '33.67', '30.17', '33.25',
       '25.25', '34.75', '47.33', '39.08', '42.75', '38.92', '62.75',
       '32.25', '26.75', '63.33', '30.75', '16.00', '19.50', '32.42',
       '30.25', '26.83', '16.92', '24.42', '39.42', '23.58', '21.42',
       '33.00', '26.33', '26.25', '28.17', '20.83', '43.17', '56.83',
       '15.17', '29.83', '31.00', '51.92', '69.50', '19.58', '22.25',
       '38.42', '26.58', '35.00', '29.42', '49.17', '51.83', '58.58',
       '53.33', '27.17', '25.92', '30.58', '17.25', '27.33', '36.50',
       '29.75', '52.42', '36.17', '34.58', '21.92', '36.58', '31.08',
       '30.42', '21.08', '17.42', '39.17', '26.50', '17.33', '23.75',
       '34.67', '74.83', '45.33', '47.25', '24.17', '39.25', '39.00',
       '64.08', '31.33', '21.00', '13.75', '46.00', '20.25', '60.92',
       '30.00', '22.83', '45.17', '41.58', '55.75', '25.33', '31.83',
       '33.92', '24.92', '80.25', '30.08', '48.33', '76.75', '51.33',
       '41.92', '29.58', '32.17', '51.42', '42.17', '43.08', '59.50',
       '65.17', '20.33', '48.50', '28.08', '73.42', '51.58', '38.67',
       '46.08', '20.08', '42.25', '16.17', '47.83', '22.00', '38.33',
       '25.58', '21.58', '36.08', '38.75', '35.58', '31.58', '15.75',
       '17.92', '30.33', '47.17', '25.83', '50.25', '36.42'], dtype=object)



In [27]:

    
credit.A2 = credit.A2.replace('?', np.nan)



In [28]:

    
credit.A2 = credit.A2.astype(float)



In [29]:

    
credit.A2.hist()









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x1093a1190>



In [30]:

    
a2_missing_vals = np.random.normal(credit.A2.mean(), credit.A2.std(), 12)
credit.loc[credit.A2.isnull(), 'A2'] = a2_missing_vals
credit.A2.isnull().sum()









    Out[30]:





0



In [31]:

    
def impute_numeric_cols(col_data, col_name):
    na_row_count = col_data.isnull().sum()
    impute_vals = np.random.normal(col_data.mean(), col_data.std(), na_row_count)
    return impute_vals
credit['A14'] = credit['A14'].replace('?', np.nan)
credit['A14'] = credit['A14'].astype(float)

na_rows_mask = credit['A14'].isnull()
credit.loc[na_rows_mask, 'A14'] = impute_numeric_cols(credit['A14'], 'A14')



In [32]:

    
credit.head()



In [33]:

    
features = credit.ix[:, 'A1':'A15']
target = credit['A16R']



In [34]:

    
# Build X_data (Features) and y_data (target)
X_data = pd.get_dummies(features)
y_data = target.values



In [35]:

    
X_data









    Out[35]:






  
    
      
      A2
      A3
      A8
      A11
      A14
      A15
      A1_a
      A1_b
      A4_l
      A4_u
      ...
      A7_z
      A9_f
      A9_t
      A10_f
      A10_t
      A12_f
      A12_t
      A13_g
      A13_p
      A13_s
    
  
  
    
      0  
       30.83
        0.000
       1.25
       1
       202
         0
       0
       1
       0
       1
      ...
       0
       0
       1
       0
       1
       1
       0
       1
       0
       0
    
    
      1  
       58.67
        4.460
       3.04
       6
        43
       560
       1
       0
       0
       1
      ...
       0
       0
       1
       0
       1
       1
       0
       1
       0
       0
    
    
      2  
       24.50
        0.500
       1.50
       0
       280
       824
       1
       0
       0
       1
      ...
       0
       0
       1
       1
       0
       1
       0
       1
       0
       0
    
    
      3  
       27.83
        1.540
       3.75
       5
       100
         3
       1
       0
       0
       1
      ...
       0
       0
       1
       0
       1
       0
       1
       1
       0
       0
    
    
      4  
       20.17
        5.625
       1.71
       0
       120
         0
       1
       0
       0
       1
      ...
       0
       0
       1
       1
       0
       1
       0
       0
       0
       1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      685
       21.08
       10.085
       1.25
       0
       260
         0
       1
       0
       0
       0
      ...
       0
       1
       0
       1
       0
       1
       0
       1
       0
       0
    
    
      686
       22.67
        0.750
       2.00
       2
       200
       394
       0
       1
       0
       1
      ...
       0
       1
       0
       0
       1
       0
       1
       1
       0
       0
    
    
      687
       25.25
       13.500
       2.00
       1
       200
         1
       1
       0
       0
       0
      ...
       0
       1
       0
       0
       1
       0
       1
       1
       0
       0
    
    
      688
       17.92
        0.205
       0.04
       0
       280
       750
       1
       0
       0
       1
      ...
       0
       1
       0
       1
       0
       1
       0
       1
       0
       0
    
    
      689
       35.00
        3.375
       8.29
       0
         0
         0
       1
       0
       0
       1
      ...
       0
       1
       0
       1
       0
       0
       1
       1
       0
       0
    
  

690 rows × 46 columns



In [36]:

    
from sklearn import linear_model, tree, metrics, cross_validation, grid_search



In [37]:

    
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_data, y_data, random_state=12, test_size=0.2)



In [38]:

    
clf_lreg = linear_model.LogisticRegression()



In [39]:

    
clf_lreg.fit(X_train, y_train)









    Out[39]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [40]:

    
clf_lreg.score(X_test, y_test)









    Out[40]:





0.8623188405797102



In [ ]:

	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	A11	A12	A13	A14	A15	A16
0	b	30.83	0.000	u	g	w	v	1.25	t	t	1	f	g	00202	0	+
1	a	58.67	4.460	u	g	q	h	3.04	t	t	6	f	g	00043	560	+
2	a	24.50	0.500	u	g	q	h	1.50	t	f	0	f	g	00280	824	+
3	b	27.83	1.540	u	g	w	v	3.75	t	t	5	t	g	00100	3	+
4	b	20.17	5.625	u	g	w	v	1.71	t	f	0	f	s	00120	0	+