In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# This enables inline Plots
%matplotlib inline

pd.set_option('display.max_rows', 10)

In [2]:
credit = pd.read_csv('../data/credit-screening/crx_data.csv', header = None, 
                     names = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16'])

In [3]:
print credit


    A1     A2      A3 A4 A5  A6  A7    A8 A9 A10  A11 A12 A13    A14  A15 A16
0    b  30.83   0.000  u  g   w   v  1.25  t   t    1   f   g  00202    0   +
1    a  58.67   4.460  u  g   q   h  3.04  t   t    6   f   g  00043  560   +
2    a  24.50   0.500  u  g   q   h  1.50  t   f    0   f   g  00280  824   +
3    b  27.83   1.540  u  g   w   v  3.75  t   t    5   t   g  00100    3   +
4    b  20.17   5.625  u  g   w   v  1.71  t   f    0   f   s  00120    0   +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  ...  ..  ..    ...  ...  ..
685  b  21.08  10.085  y  p   e   h  1.25  f   f    0   f   g  00260    0   -
686  a  22.67   0.750  u  g   c   v  2.00  f   t    2   t   g  00200  394   -
687  a  25.25  13.500  y  p  ff  ff  2.00  f   t    1   t   g  00200    1   -
688  b  17.92   0.205  u  g  aa   v  0.04  f   f    0   f   g  00280  750   -
689  b  35.00   3.375  u  g   c   h  8.29  f   f    0   t   g  00000    0   -

[690 rows x 16 columns]

In [4]:
credit.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [5]:
credit.columns


Out[5]:
Index([u'A1', u'A2', u'A3', u'A4', u'A5', u'A6', u'A7', u'A8', u'A9', u'A10', u'A11', u'A12', u'A13', u'A14', u'A15', u'A16'], dtype='object')

In [6]:
credit.head()


Out[6]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
3 b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
4 b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +

In [7]:
credit.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
A1     690 non-null object
A2     690 non-null object
A3     690 non-null float64
A4     690 non-null object
A5     690 non-null object
A6     690 non-null object
A7     690 non-null object
A8     690 non-null float64
A9     690 non-null object
A10    690 non-null object
A11    690 non-null int64
A12    690 non-null object
A13    690 non-null object
A14    690 non-null object
A15    690 non-null int64
A16    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 91.6+ KB

In [8]:
def recodeA16(sign):
        if sign == '+':
            return 1
        return 0
credit['A16R'] = credit['A16'].map(recodeA16).astype(float)
credit = credit.drop('A16', axis = 1)

In [9]:
#credit['A1': 'A3', 'A4': 'A8', 'A9': 'A11', 'A12':'A15'] = credit['A1': 'A3', 'A4': 'A8', 'A9': 'A11', 'A12':'A15'].astype(float)

In [10]:
credit.replace('?', np.NaN, inplace = True)

In [11]:
credit.A1.unique()


Out[11]:
array(['b', 'a', nan], dtype=object)

In [12]:
credit.drop(['A4', 'A5', 'A6', 'A7'], axis = 1)


Out[12]:
A1 A2 A3 A8 A9 A10 A11 A12 A13 A14 A15 A16R
0 b 30.83 0.000 1.25 t t 1 f g 00202 0 1
1 a 58.67 4.460 3.04 t t 6 f g 00043 560 1
2 a 24.50 0.500 1.50 t f 0 f g 00280 824 1
3 b 27.83 1.540 3.75 t t 5 t g 00100 3 1
4 b 20.17 5.625 1.71 t f 0 f s 00120 0 1
... ... ... ... ... ... ... ... ... ... ... ... ...
685 b 21.08 10.085 1.25 f f 0 f g 00260 0 0
686 a 22.67 0.750 2.00 f t 2 t g 00200 394 0
687 a 25.25 13.500 2.00 f t 1 t g 00200 1 0
688 b 17.92 0.205 0.04 f f 0 f g 00280 750 0
689 b 35.00 3.375 8.29 f f 0 t g 00000 0 0

690 rows × 12 columns


In [13]:
credit.A1.unique()


Out[13]:
array(['b', 'a', nan], dtype=object)

In [14]:
def impute_a1(val):
    return np.random.choice(['a', 'b'], p=[0.7, 0.3])

In [15]:
credit.A1.value_counts()/credit.A1.size


Out[15]:
b    0.678261
a    0.304348
dtype: float64

In [16]:
def impute_a1(val):
    return np.random.choice(['a', 'b'], p = [0.7, 0.3])

In [17]:
credit.A1 = credit.A1.map(impute_a1)

In [18]:
credit.A1.unique()


Out[18]:
array(['b', 'a'], dtype=object)

In [19]:
credit.A5.unique()


Out[19]:
array(['g', 'p', nan, 'gg'], dtype=object)

In [20]:
col_dist = {}
def get_col_dist(col_name):
    excl_null_mask = credit[col_name] != '?'
    row_count = credit[excl_null_mask][col_name].size
    col_data = {}
    col_data['prob'] = (credit[excl_null_mask][col_name].value_counts() / row_count).values
    col_data['values'] = (credit[excl_null_mask][col_name].value_counts() / row_count).index.values
    return col_data

In [21]:
col_dist['A5'] = get_col_dist('A5')
col_dist['A6'] = get_col_dist('A6')
col_dist['A7'] = get_col_dist('A7')

In [22]:
def impute_cols(val, options):
    if val == '?':
        return np.random.choice(options['values'], p=options['prob'])
    return val

In [23]:
def impute_a5(val):
    return impute_cols(val, col_dist['A5'])

def impute_a6(val):
    return impute_cols(val, col_dist['A6'])

def impute_a7(val):
    return impute_cols(val, col_dist['A7'])

In [24]:
credit.A5 = credit.A5.map(impute_a5)
credit.A6 = credit.A6.map(impute_a6)
credit.A7 = credit.A7.map(impute_a7)
# crx_data.A14 = crx_data.A14.map(impute_a14)

In [25]:
credit.A6.unique()
credit.A7.unique()


Out[25]:
array(['v', 'h', 'bb', 'ff', 'j', 'z', nan, 'o', 'dd', 'n'], dtype=object)

In [26]:
credit.A2.unique()


Out[26]:
array(['30.83', '58.67', '24.50', '27.83', '20.17', '32.08', '33.17',
       '22.92', '54.42', '42.50', '22.08', '29.92', '38.25', '48.08',
       '45.83', '36.67', '28.25', '23.25', '21.83', '19.17', '25.00',
       '47.75', '27.42', '41.17', '15.83', '47.00', '56.58', '57.42',
       '42.08', '29.25', '42.00', '49.50', '36.75', '22.58', '27.25',
       '23.00', '27.75', '54.58', '34.17', '28.92', '29.67', '39.58',
       '56.42', '54.33', '41.00', '31.92', '41.50', '23.92', '25.75',
       '26.00', '37.42', '34.92', '34.25', '23.33', '23.17', '44.33',
       '35.17', '43.25', '56.75', '31.67', '23.42', '20.42', '26.67',
       '36.00', '25.50', '19.42', '32.33', '34.83', '38.58', '44.25',
       '44.83', '20.67', '34.08', '21.67', '21.50', '49.58', '27.67',
       '39.83', nan, '37.17', '25.67', '34.00', '49.00', '62.50', '31.42',
       '52.33', '28.75', '28.58', '22.50', '28.50', '37.50', '35.25',
       '18.67', '54.83', '40.92', '19.75', '29.17', '24.58', '33.75',
       '25.42', '37.75', '52.50', '57.83', '20.75', '39.92', '24.75',
       '44.17', '23.50', '47.67', '22.75', '34.42', '28.42', '67.75',
       '47.42', '36.25', '32.67', '48.58', '33.58', '18.83', '26.92',
       '31.25', '56.50', '43.00', '22.33', '32.83', '40.33', '30.50',
       '52.83', '46.67', '58.33', '37.33', '23.08', '32.75', '68.67',
       '28.00', '44.00', '25.08', '32.00', '60.58', '40.83', '19.33',
       '41.33', '56.00', '49.83', '22.67', '27.00', '26.08', '18.42',
       '21.25', '57.08', '22.42', '48.75', '40.00', '40.58', '28.67',
       '33.08', '21.33', '41.75', '34.50', '48.17', '27.58', '24.08',
       '24.83', '36.33', '35.42', '71.58', '39.50', '39.33', '24.33',
       '60.08', '55.92', '53.92', '18.92', '50.08', '65.42', '17.58',
       '18.08', '19.67', '25.17', '33.50', '58.42', '26.17', '42.83',
       '38.17', '20.50', '48.25', '28.33', '18.75', '18.50', '45.00',
       '40.25', '41.42', '17.83', '18.17', '20.00', '52.17', '50.75',
       '17.08', '18.33', '59.67', '18.00', '37.58', '30.67', '18.58',
       '16.25', '21.17', '17.67', '16.50', '29.50', '21.75', '18.25',
       '35.75', '16.08', '69.17', '32.92', '16.33', '22.17', '57.58',
       '15.92', '31.75', '19.00', '17.50', '33.67', '30.17', '33.25',
       '25.25', '34.75', '47.33', '39.08', '42.75', '38.92', '62.75',
       '32.25', '26.75', '63.33', '30.75', '16.00', '19.50', '32.42',
       '30.25', '26.83', '16.92', '24.42', '39.42', '23.58', '21.42',
       '33.00', '26.33', '26.25', '28.17', '20.83', '43.17', '56.83',
       '15.17', '29.83', '31.00', '51.92', '69.50', '19.58', '22.25',
       '38.42', '26.58', '35.00', '29.42', '49.17', '51.83', '58.58',
       '53.33', '27.17', '25.92', '30.58', '17.25', '27.33', '36.50',
       '29.75', '52.42', '36.17', '34.58', '21.92', '36.58', '31.08',
       '30.42', '21.08', '17.42', '39.17', '26.50', '17.33', '23.75',
       '34.67', '74.83', '45.33', '47.25', '24.17', '39.25', '39.00',
       '64.08', '31.33', '21.00', '13.75', '46.00', '20.25', '60.92',
       '30.00', '22.83', '45.17', '41.58', '55.75', '25.33', '31.83',
       '33.92', '24.92', '80.25', '30.08', '48.33', '76.75', '51.33',
       '41.92', '29.58', '32.17', '51.42', '42.17', '43.08', '59.50',
       '65.17', '20.33', '48.50', '28.08', '73.42', '51.58', '38.67',
       '46.08', '20.08', '42.25', '16.17', '47.83', '22.00', '38.33',
       '25.58', '21.58', '36.08', '38.75', '35.58', '31.58', '15.75',
       '17.92', '30.33', '47.17', '25.83', '50.25', '36.42'], dtype=object)

In [27]:
credit.A2 = credit.A2.replace('?', np.nan)

In [28]:
credit.A2 = credit.A2.astype(float)

In [29]:
credit.A2.hist()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1093a1190>

In [30]:
a2_missing_vals = np.random.normal(credit.A2.mean(), credit.A2.std(), 12)
credit.loc[credit.A2.isnull(), 'A2'] = a2_missing_vals
credit.A2.isnull().sum()


Out[30]:
0

In [31]:
def impute_numeric_cols(col_data, col_name):
    na_row_count = col_data.isnull().sum()
    impute_vals = np.random.normal(col_data.mean(), col_data.std(), na_row_count)
    return impute_vals
credit['A14'] = credit['A14'].replace('?', np.nan)
credit['A14'] = credit['A14'].astype(float)

na_rows_mask = credit['A14'].isnull()
credit.loc[na_rows_mask, 'A14'] = impute_numeric_cols(credit['A14'], 'A14')

In [32]:
credit.head()


Out[32]:
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16R
0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202 0 1
1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43 560 1
2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280 824 1
3 a 27.83 1.540 u g w v 3.75 t t 5 t g 100 3 1
4 a 20.17 5.625 u g w v 1.71 t f 0 f s 120 0 1

In [33]:
features = credit.ix[:, 'A1':'A15']
target = credit['A16R']

In [34]:
# Build X_data (Features) and y_data (target)
X_data = pd.get_dummies(features)
y_data = target.values

In [35]:
X_data


Out[35]:
A2 A3 A8 A11 A14 A15 A1_a A1_b A4_l A4_u ... A7_z A9_f A9_t A10_f A10_t A12_f A12_t A13_g A13_p A13_s
0 30.83 0.000 1.25 1 202 0 0 1 0 1 ... 0 0 1 0 1 1 0 1 0 0
1 58.67 4.460 3.04 6 43 560 1 0 0 1 ... 0 0 1 0 1 1 0 1 0 0
2 24.50 0.500 1.50 0 280 824 1 0 0 1 ... 0 0 1 1 0 1 0 1 0 0
3 27.83 1.540 3.75 5 100 3 1 0 0 1 ... 0 0 1 0 1 0 1 1 0 0
4 20.17 5.625 1.71 0 120 0 1 0 0 1 ... 0 0 1 1 0 1 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
685 21.08 10.085 1.25 0 260 0 1 0 0 0 ... 0 1 0 1 0 1 0 1 0 0
686 22.67 0.750 2.00 2 200 394 0 1 0 1 ... 0 1 0 0 1 0 1 1 0 0
687 25.25 13.500 2.00 1 200 1 1 0 0 0 ... 0 1 0 0 1 0 1 1 0 0
688 17.92 0.205 0.04 0 280 750 1 0 0 1 ... 0 1 0 1 0 1 0 1 0 0
689 35.00 3.375 8.29 0 0 0 1 0 0 1 ... 0 1 0 1 0 0 1 1 0 0

690 rows × 46 columns


In [36]:
from sklearn import linear_model, tree, metrics, cross_validation, grid_search

In [37]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_data, y_data, random_state=12, test_size=0.2)

In [38]:
clf_lreg = linear_model.LogisticRegression()

In [39]:
clf_lreg.fit(X_train, y_train)


Out[39]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [40]:
clf_lreg.score(X_test, y_test)


Out[40]:
0.8623188405797102

In [ ]: