In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import KMeans
from sklearn import mixture

from sklearn.cross_validation import train_test_split

%matplotlib inline

first glance


In [35]:
!head ../data/loan_dataset.csv











load data


In [36]:
df = pd.read_csv("../data/loan_dataset.csv", na_values = ['?'])
df.info()
df.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10010 entries, 0 to 10009
Data columns (total 16 columns):
id                10010 non-null int64
Age               10008 non-null float64
Work Class        9406 non-null object
FnlWgt            10008 non-null float64
Education         10010 non-null object
Education Num     10010 non-null int64
Maried Status     10010 non-null object
Occupation        9405 non-null object
Relationship      10010 non-null object
Race              10010 non-null object
Gender            10010 non-null object
Capital Gain      10010 non-null int64
Capital Loss      10010 non-null int64
hours per wk      10010 non-null int64
Native Country    9839 non-null object
APPROVE/NOT       10010 non-null int64
dtypes: float64(2), int64(6), object(8)
memory usage: 1.2+ MB
Out[36]:
id Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
0 1 25.0 Private 226802.0 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States 0
1 2 38.0 Private 89814.0 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States 0
2 3 28.0 Local-gov 336951.0 Assoc-acdm 10000 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States 1
3 4 44.0 Private 160323.0 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States 1
4 5 18.0 NaN 103497.0 Some-college 10 Never-married NaN Own-child White Female 0 0 30 United-States 0
5 6 34.0 Private 198693.0 10th 6 Never-married Other-service Not-in-family White Male 0 0 30 United-States 0
6 7 29.0 NaN 227026.0 HS-grad 9 Never-married NaN Unmarried Black Male 0 0 40 United-States 0
7 8 63.0 Self-emp-not-inc 104626.0 Prof-school 15 Married-civ-spouse Prof-specialty Husband White Male 3103 0 32 United-States 1
8 9 24.0 Private 369667.0 Some-college 10 Never-married Other-service Unmarried White Female 0 0 40 United-States 0
9 10 55.0 Private 104996.0 7th-8th 4 Married-civ-spouse Craft-repair Husband White Male 0 0 10 United-States 0

In [37]:
s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))
print("apprive rate:%.3f%%"  %(approve_rate[1] * 100))


apprive rate:23.566%

aata wrangling

remove useless column


In [38]:
df.drop(df.columns[[0]], axis = 1, inplace = True)
df.info()
df.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10010 entries, 0 to 10009
Data columns (total 15 columns):
Age               10008 non-null float64
Work Class        9406 non-null object
FnlWgt            10008 non-null float64
Education         10010 non-null object
Education Num     10010 non-null int64
Maried Status     10010 non-null object
Occupation        9405 non-null object
Relationship      10010 non-null object
Race              10010 non-null object
Gender            10010 non-null object
Capital Gain      10010 non-null int64
Capital Loss      10010 non-null int64
hours per wk      10010 non-null int64
Native Country    9839 non-null object
APPROVE/NOT       10010 non-null int64
dtypes: float64(2), int64(5), object(8)
memory usage: 1.1+ MB
Out[38]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
0 25.0 Private 226802.0 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States 0
1 38.0 Private 89814.0 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States 0
2 28.0 Local-gov 336951.0 Assoc-acdm 10000 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States 1
3 44.0 Private 160323.0 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States 1
4 18.0 NaN 103497.0 Some-college 10 Never-married NaN Own-child White Female 0 0 30 United-States 0
5 34.0 Private 198693.0 10th 6 Never-married Other-service Not-in-family White Male 0 0 30 United-States 0
6 29.0 NaN 227026.0 HS-grad 9 Never-married NaN Unmarried Black Male 0 0 40 United-States 0
7 63.0 Self-emp-not-inc 104626.0 Prof-school 15 Married-civ-spouse Prof-specialty Husband White Male 3103 0 32 United-States 1
8 24.0 Private 369667.0 Some-college 10 Never-married Other-service Unmarried White Female 0 0 40 United-States 0
9 55.0 Private 104996.0 7th-8th 4 Married-civ-spouse Craft-repair Husband White Male 0 0 10 United-States 0

In [39]:
df.describe(include = 'all')


Out[39]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
count 10008.000000 9406 1.000800e+04 10010 10010.000000 10010 9405 10010 10010 10010 10010.000000 10010.000000 10010.000000 9839 10010.000000
unique NaN 8 NaN 16 NaN 7 14 6 5 2 NaN NaN NaN 40 NaN
top NaN Private NaN HS-grad NaN Married-civ-spouse Prof-specialty Husband White Male NaN NaN NaN United-States NaN
freq NaN 6849 NaN 3241 NaN 4505 1269 3976 8589 6681 NaN NaN NaN 9038 NaN
mean 38.759592 NaN 1.883641e+05 NaN 11.070729 NaN NaN NaN NaN NaN 1139.195205 88.639461 40.471728 NaN 0.235664
std 13.873285 NaN 1.050199e+05 NaN 99.882405 NaN NaN NaN NaN NaN 7996.332111 405.248276 12.422044 NaN 0.424435
min 17.000000 NaN 0.000000e+00 NaN 1.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 1.000000 NaN 0.000000
25% 28.000000 NaN 1.157595e+05 NaN 9.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 40.000000 NaN 0.000000
50% 37.000000 NaN 1.777905e+05 NaN 10.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 40.000000 NaN 0.000000
75% 48.000000 NaN 2.375080e+05 NaN 12.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 45.000000 NaN 0.000000
max 90.000000 NaN 1.490400e+06 NaN 10000.000000 NaN NaN NaN NaN NaN 99999.000000 3770.000000 99.000000 NaN 1.000000

missing value handling


In [40]:
def count_missing(x):
  return sum(x.isnull()) / float(len(x))

print "Missing Value Statistics"
print df.apply(count_missing, axis = 0)


Missing Value Statistics
Age               0.000200
Work Class        0.060340
FnlWgt            0.000200
Education         0.000000
Education Num     0.000000
Maried Status     0.000000
Occupation        0.060440
Relationship      0.000000
Race              0.000000
Gender            0.000000
Capital Gain      0.000000
Capital Loss      0.000000
hours per wk      0.000000
Native Country    0.017083
APPROVE/NOT       0.000000
dtype: float64

In [41]:
df.fillna(value = {'Native Country' : 'Others'}, inplace = True)
df.fillna(value = {'Occupation' : 'Others'}, inplace = True)
df.fillna(value = {'Work Class' : 'Others'}, inplace = True)

df.head()


Out[41]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
0 25.0 Private 226802.0 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States 0
1 38.0 Private 89814.0 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States 0
2 28.0 Local-gov 336951.0 Assoc-acdm 10000 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States 1
3 44.0 Private 160323.0 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States 1
4 18.0 Others 103497.0 Some-college 10 Never-married Others Own-child White Female 0 0 30 United-States 0

In [42]:
df.dropna(inplace = True)

In [43]:
print df.apply(count_missing, axis = 0)


Age               0.0
Work Class        0.0
FnlWgt            0.0
Education         0.0
Education Num     0.0
Maried Status     0.0
Occupation        0.0
Relationship      0.0
Race              0.0
Gender            0.0
Capital Gain      0.0
Capital Loss      0.0
hours per wk      0.0
Native Country    0.0
APPROVE/NOT       0.0
dtype: float64

outlier handling


In [44]:
print len(df['Native Country'].unique())
country_cnt = df['Native Country'].value_counts(sort = True)

for (key, val) in country_cnt.iteritems():
    if val < 30:
        df['Native Country'].replace(key, 'Others', inplace = True)
country_cnt = df['Native Country'].value_counts(sort = True)
country_cnt


41
Out[44]:
United-States    9034
Others            539
Mexico            193
Philippines        53
Puerto-Rico        44
Germany            41
India              36
Canada             35
El-Salvador        31
Name: Native Country, dtype: int64

In [45]:
print len(df['Occupation'].unique())
occupation_cnt = df['Occupation'].value_counts(sort = True)
occupation_cnt

for (key, val) in occupation_cnt.iteritems():
    if val < 30:
        df['Occupation'].replace(key, 'Others', inplace = True)
occupation_cnt = df['Occupation'].value_counts(sort = True)
occupation_cnt


15
Out[45]:
Prof-specialty       1267
Exec-managerial      1266
Craft-repair         1230
Adm-clerical         1118
Sales                1111
Other-service        1006
Machine-op-inspct     645
Others                608
Transport-moving      448
Handlers-cleaners     436
Farming-fishing       316
Tech-support          297
Protective-serv       199
Priv-house-serv        59
Name: Occupation, dtype: int64

In [46]:
print len(df['Work Class'].unique())
work_cnt = df['Work Class'].value_counts(sort = True)
work_cnt

for (key, val) in work_cnt.iteritems():
    if val < 30:
        df['Work Class'].replace(key, 'Others', inplace = True)
work_cnt = df['Work Class'].value_counts(sort = True)
work_cnt


9
Out[46]:
Private             6847
Self-emp-not-inc     814
Local-gov            633
Others               609
State-gov            445
Self-emp-inc         369
Federal-gov          289
Name: Work Class, dtype: int64

In [47]:
df.describe(include = 'all', percentiles = [0.25, 0.5, 0.75, 0.997])


Out[47]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
count 10006.000000 10006 1.000600e+04 10006 10006.000000 10006 10006 10006 10006 10006 10006.000000 10006.000000 10006.000000 10006 10006.000000
unique NaN 7 NaN 16 NaN 7 14 6 5 2 NaN NaN NaN 9 NaN
top NaN Private NaN HS-grad NaN Married-civ-spouse Prof-specialty Husband White Male NaN NaN NaN United-States NaN
freq NaN 6847 NaN 3240 NaN 4502 1267 3974 8585 6679 NaN NaN NaN 9034 NaN
mean 38.760344 NaN 1.883586e+05 NaN 11.070358 NaN NaN NaN NaN NaN 1138.921247 88.674895 40.466920 NaN 0.235559
std 13.873849 NaN 1.050012e+05 NaN 99.902352 NaN NaN NaN NaN NaN 7997.669072 405.325401 12.414652 NaN 0.424369
min 17.000000 NaN 0.000000e+00 NaN 1.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 1.000000 NaN 0.000000
25% 28.000000 NaN 1.157790e+05 NaN 9.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 40.000000 NaN 0.000000
50% 37.000000 NaN 1.777905e+05 NaN 10.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 40.000000 NaN 0.000000
75% 48.000000 NaN 2.375018e+05 NaN 12.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 45.000000 NaN 0.000000
99.7% 80.000000 NaN 6.564667e+05 NaN 16.000000 NaN NaN NaN NaN NaN 99999.000000 2415.000000 98.000000 NaN 1.000000
max 90.000000 NaN 1.490400e+06 NaN 10000.000000 NaN NaN NaN NaN NaN 99999.000000 3770.000000 99.000000 NaN 1.000000

In [48]:
age_list = df.Age.tolist()
cats = pd.qcut(age_list, 5)
cats


Out[48]:
[[17, 26], (33, 41], (26, 33], (41, 51], [17, 26], ..., (26, 33], (51, 90], (41, 51], (51, 90], (41, 51]]
Length: 10006
Categories (5, object): [[17, 26] < (26, 33] < (33, 41] < (41, 51] < (51, 90]]

In [49]:
def age_bin(x):
    if x <= 26:
        return 0
    elif x > 26 and x <= 33:
        return 1
    elif x > 33 and x <= 41:
        return 2
    elif x > 41 and x <= 60:
        return 3
    else:
        return 4
df.Age = df['Age'].apply(age_bin)
df['Age'] = df['Age'].astype('category')
df.head()


Out[49]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
0 0 Private 226802.0 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States 0
1 2 Private 89814.0 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States 0
2 1 Local-gov 336951.0 Assoc-acdm 10000 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States 1
3 3 Private 160323.0 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States 1
4 0 Others 103497.0 Some-college 10 Never-married Others Own-child White Female 0 0 30 United-States 0

In [50]:
df = df[df['Education Num'] <= 16]
df['Education Num'].hist(bins = 10)


Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd46e23b4d0>

In [51]:
edu_list = df['Education Num'].tolist()
edu_cats = pd.qcut(edu_list, 4)
edu_cats


Out[51]:
[[1, 9], [1, 9], (9, 10], (9, 10], [1, 9], ..., [1, 9], (9, 10], [1, 9], [1, 9], (10, 12]]
Length: 10005
Categories (4, object): [[1, 9] < (9, 10] < (10, 12] < (12, 16]]

In [52]:
def edu_bin(x):
    if x <= 9:
        return 0
    elif x > 9 and x <= 10:
        return 1
    elif x > 10 and x <= 12:
        return 2
    elif x > 12:
        return 3
df['Education Num'] = df['Education Num'].apply(edu_bin)
df['Education Num'] = df['Education Num'].astype('category')
df.head()


Out[52]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender Capital Gain Capital Loss hours per wk Native Country APPROVE/NOT
0 0 Private 226802.0 11th 0 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States 0
1 2 Private 89814.0 HS-grad 0 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States 0
3 3 Private 160323.0 Some-college 1 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States 1
4 0 Others 103497.0 Some-college 1 Never-married Others Own-child White Female 0 0 30 United-States 0
5 2 Private 198693.0 10th 0 Never-married Other-service Not-in-family White Male 0 0 30 United-States 0

In [53]:
df.drop(['Capital Gain', 'Capital Loss'], axis = 1, inplace = True)
df.head()


Out[53]:
Age Work Class FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender hours per wk Native Country APPROVE/NOT
0 0 Private 226802.0 11th 0 Never-married Machine-op-inspct Own-child Black Male 40 United-States 0
1 2 Private 89814.0 HS-grad 0 Married-civ-spouse Farming-fishing Husband White Male 50 United-States 0
3 3 Private 160323.0 Some-college 1 Married-civ-spouse Machine-op-inspct Husband Black Male 40 United-States 1
4 0 Others 103497.0 Some-college 1 Never-married Others Own-child White Female 30 United-States 0
5 2 Private 198693.0 10th 0 Never-married Other-service Not-in-family White Male 30 United-States 0

Feature Evaluation


In [54]:
from scipy.stats import norm, entropy

s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))

hy = entropy(approve_rate)
print hy


0.545816229217

In [55]:
def HY_X(py_x, px):
    ret = 0
    for i in range(len(px)):
        # print px[i], py_x[i]
        ret += px[i] * entropy([py_x[i], 1 - py_x[i]])
    return ret

def NE(hx, hy, hy_x):
    return (hy - hy_x) / (hx + hy)

In [56]:
cols = list(df.columns.values)
cols.remove("APPROVE/NOT")
cols.remove("FnlWgt")
cols.remove("hours per wk")

print cols

for column_name in cols:
  a = df['APPROVE/NOT'].groupby(df[column_name]).sum()
  b = df['APPROVE/NOT'].groupby(df[column_name]).count()
  py_x = a.div(b)

  px = df[column_name].value_counts() / float(df[column_name].count())
  hx = entropy(px)

  hy_x = HY_X(py_x, px)

  ne = NE(hx, hy, hy_x)
    
  print "%-15s %-.5d, %-.5f  %-.5f" % (column_name, hx, hy_x, ne)


['Age', 'Work Class', 'Education', 'Education Num', 'Maried Status', 'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country']
Age             00001, 0.48192  0.03069
Work Class      00001, 0.62446  -0.04581
Education       00002, 0.24668  0.11618
Education Num   00001, 0.49695  0.02746
Maried Status   00001, 0.46559  0.04385
Occupation      00002, 0.42615  0.04001
Relationship    00001, 0.41859  0.06225
Race            00000, 0.43712  0.09984
Gender          00000, 0.43686  0.09220
Native Country  00000, 0.54536  0.00046

In [57]:
df.drop(['Work Class'], axis = 1, inplace = True)

the first model


In [58]:
df1 = df

In [59]:
cols = ['Education', 'Maried Status', 'Occupation', 'Race', 'Relationship', 'Gender', 'Native Country']
for col in cols:
  keys = df1[col].unique()
  values = range(1, len(keys) + 1)
  zip_list = zip(keys,values)
  dict1 = dict( (keys,value) for keys,value in zip_list)
  for key, value in dict1.items():
    df1[col].replace(key, value, inplace = True)
    df1[col] = df1[col].astype('category')

df1['APPROVE/NOT'] = df1['APPROVE/NOT'].astype('category')
df1.describe(include = 'all')


Out[59]:
Age FnlWgt Education Education Num Maried Status Occupation Relationship Race Gender hours per wk Native Country APPROVE/NOT
count 10005.0 1.000500e+04 10005.0 10005.0 10005.0 10005.0 10005.0 10005.0 10005.0 10005.000000 10005.0 10005.0
unique 5.0 NaN 16.0 4.0 7.0 14.0 6.0 5.0 2.0 NaN 9.0 2.0
top 3.0 NaN 2.0 0.0 2.0 5.0 2.0 2.0 1.0 NaN 1.0 0.0
freq 3077.0 NaN 3240.0 4555.0 4501.0 1267.0 3973.0 8584.0 6678.0 NaN 9033.0 7649.0
mean NaN 1.883437e+05 NaN NaN NaN NaN NaN NaN NaN 40.466967 NaN NaN
std NaN 1.049959e+05 NaN NaN NaN NaN NaN NaN NaN 12.415272 NaN NaN
min NaN 0.000000e+00 NaN NaN NaN NaN NaN NaN NaN 1.000000 NaN NaN
25% NaN 1.157710e+05 NaN NaN NaN NaN NaN NaN NaN 40.000000 NaN NaN
50% NaN 1.777750e+05 NaN NaN NaN NaN NaN NaN NaN 40.000000 NaN NaN
75% NaN 2.374980e+05 NaN NaN NaN NaN NaN NaN NaN 45.000000 NaN NaN
max NaN 1.490400e+06 NaN NaN NaN NaN NaN NaN NaN 99.000000 NaN NaN

In [60]:
df1.to_csv('full_data1.csv', index = False, header = False, 
           columns = ['Age', 'Education', 'Education Num', 'Maried Status', 
                      'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country', 
                      'FnlWgt', 'hours per wk', 'APPROVE/NOT'])

In [61]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from matplotlib import pylab
from collections import defaultdict

label_list = ['not approve', 'approve']
def train_model(clf_factory, X, Y, cv, name, isplot = False):
    labels = np.unique(Y).astype('int')
    print labels

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    weights = [0] * X.shape[1]

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
    
        weights = [a + b for (a, b) in zip(weights, clf.coef_[0])]

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    avg_weights = [weight / float(cv.get_n_splits()) for weight in weights]
    print avg_weights

    if isplot:
        for label in labels:
            print "Plotting", label_list[label]
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

            desc = "%s %s" % (name, label_list[label])
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % label_list[label])

    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)

def plot_confusion_matrix(cm, label_list, name, title):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.matshow(cm, fignum = False, cmap = 'Blues', vmin = 0, vmax = 1.0)
    ax = pylab.axes()
    ax.set_xticks(range(len(label_list)))
    ax.set_xticklabels(label_list)
    ax.xaxis.set_ticks_position("bottom")
    ax.set_yticks(range(len(label_list)))
    ax.set_yticklabels(label_list)
    pylab.title(title)
    pylab.colorbar()
    pylab.grid(False)
    pylab.xlabel('Predicted Class')
    pylab.ylabel('True Class')
    pylab.show()

def plot_roc(auc_score, name, tpr, fpr, label=None):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.grid(True)
    pylab.plot([0, 1], [0, 1], 'k--')
    pylab.plot(fpr, tpr)
    pylab.fill_between(fpr, tpr, alpha=0.5)
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('False Positive Rate')
    pylab.ylabel('True Positive Rate')
    pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
    pylab.legend(loc="lower right")

In [62]:
f = open("full_data1.csv")
data = np.loadtxt(f, delimiter = ',')
# data
X1 = data[:, 0:9]
X2 = data[:, 9:11]
Y = data[:, -1]

# one-hot encoder
enc = preprocessing.OneHotEncoder()
enc.fit(X1)
TX1 = enc.transform(X1).toarray()
print TX1.shape[1]

# normalization
# zscore_scaler = preprocessing.StandardScaler()
# TX2 = zscore_scaler.fit_transform(X2)
normalizer = preprocessing.Normalizer().fit(X2)
TX2 = normalizer.transform(X2)
print TX2.shape[1]

# combine together
X = np.concatenate((TX1, TX2), axis = 1)

# cross validation
title = "Learning Curves (Logistic Regression)"
cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)


68
2

In [63]:
def create_model():
    from sklearn.linear_model.logistic import LogisticRegression
    clf = LogisticRegression(penalty='l1')
    return clf

train_avg, test_avg, cms = train_model(create_model, X, Y, cv, "Log Reg", isplot = True)

cm_avg = np.mean(cms, axis=0)
cm_norm = cm_avg / np.sum(cm_avg, axis=0)

print cm_norm

plot_confusion_matrix(cm_norm, label_list, "lr", "Confusion Matrix")


[0 1]
[-1.8283582531181934, -0.45180803088798349, 0.0, 0.31308152608544643, -0.24892599003963783, -0.084453922420286889, 0.90711870391622773, -0.12779912130831833, -0.057567671666646378, 0.88522502886905197, -0.16377499152573122, -0.26972502502519624, 0.0, 0.24717083673285606, 0.18847847746566426, -0.29956909631849854, -0.11017318397407153, 0.0, 0.403371155122562, -0.063575417749917568, 0.0, -1.5174038439132276, -0.12919501782689671, 0.0, 0.7808435611595258, -0.49764443385647061, 1.8488186108233979, -0.062341317530578831, -0.38377522392637914, -0.034095576183114241, -0.0092489704907613764, 0.33502379973223217, -0.42188511483165553, -0.69757567781541208, -0.83454580681564283, -1.0663477219125792, 0.47864268239754948, -0.0047607812503959969, 0.07772133464273559, 0.83643502849405071, 0.38323494575271322, 0.33281005893468685, 0.22017835397389315, -0.15118204372069982, -0.078229037717548827, -0.48295208429683745, -0.56732973359170058, -0.0076817490553722622, 0.34693741998518701, -0.1429337277018321, 0.649725498088221, -0.79000748087944617, -0.53904702520871162, -0.15240084566259174, 0.0035244613684062426, 0.25552047129054006, -0.25671178969247249, 0.0, -0.4100523860632152, 0.16631798947853388, -0.15094389562823279, -1.0144956447371558, -0.94746614025394982, 0.0, -0.038110633552318793, -0.025495876923457665, 0.13216583532595835, 0.03225591173812712, -0.37188352686399612, 0.0]
Plotting not approve
Plotting approve
0.826	0.008	0.819	0.143	
[[ 0.86295752  0.33714597]
 [ 0.13704248  0.66285403]]

Model Complexity


In [64]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv = cv, n_jobs = n_jobs,
        train_sizes = train_sizes, scoring = 'roc_auc')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [65]:
estimator = LogisticRegression()
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)

estimator = LogisticRegression(C = 10)
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)


Out[65]:
<module 'matplotlib.pyplot' from '/usr/lib/python2.7/dist-packages/matplotlib/pyplot.pyc'>