In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from sklearn import mixture
from sklearn.model_selection import train_test_split
%matplotlib inline
In [35]:
!head ../data/loan_dataset.csv
In [36]:
df = pd.read_csv("../data/loan_dataset.csv", na_values = ['?'])
df.info()
df.head(10)
Out[36]:
In [37]:
s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))
print("apprive rate:%.3f%%" %(approve_rate[1] * 100))
In [38]:
df.drop(df.columns[[0]], axis = 1, inplace = True)
df.info()
df.head(10)
Out[38]:
In [39]:
df.describe(include = 'all')
Out[39]:
In [40]:
def count_missing(x):
    # fraction of missing values in a column
    return sum(x.isnull()) / float(len(x))
print("Missing Value Statistics")
print(df.apply(count_missing, axis = 0))
In [41]:
df.fillna(value = {'Native Country' : 'Others'}, inplace = True)
df.fillna(value = {'Occupation' : 'Others'}, inplace = True)
df.fillna(value = {'Work Class' : 'Others'}, inplace = True)
df.head()
Out[41]:
In [42]:
df.dropna(inplace = True)
In [43]:
print(df.apply(count_missing, axis = 0))
In [44]:
print(len(df['Native Country'].unique()))
country_cnt = df['Native Country'].value_counts(sort = True)
# fold countries with fewer than 30 rows into 'Others'
for key, val in country_cnt.items():
    if val < 30:
        df['Native Country'].replace(key, 'Others', inplace = True)
country_cnt = df['Native Country'].value_counts(sort = True)
country_cnt
Out[44]:
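Aside: the per-key replace loop above works; a minimal vectorized sketch of the same rare-category folding is given below. The threshold of 30 is taken from the loop; `counts` and `rare_countries` are illustrative names, not from the original notebook.

# Sketch (equivalent to the loop above, assuming the same < 30 threshold)
counts = df['Native Country'].value_counts()
rare_countries = counts[counts < 30].index
df['Native Country'] = df['Native Country'].mask(df['Native Country'].isin(rare_countries), 'Others')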
In [45]:
print(len(df['Occupation'].unique()))
occupation_cnt = df['Occupation'].value_counts(sort = True)
# fold occupations with fewer than 30 rows into 'Others'
for key, val in occupation_cnt.items():
    if val < 30:
        df['Occupation'].replace(key, 'Others', inplace = True)
occupation_cnt = df['Occupation'].value_counts(sort = True)
occupation_cnt
Out[45]:
In [46]:
print(len(df['Work Class'].unique()))
work_cnt = df['Work Class'].value_counts(sort = True)
# fold work classes with fewer than 30 rows into 'Others'
for key, val in work_cnt.items():
    if val < 30:
        df['Work Class'].replace(key, 'Others', inplace = True)
work_cnt = df['Work Class'].value_counts(sort = True)
work_cnt
Out[46]:
In [47]:
df.describe(include = 'all', percentiles = [0.25, 0.5, 0.75, 0.997])
Out[47]:
In [48]:
age_list = df.Age.tolist()
cats = pd.qcut(age_list, 5)
cats
Out[48]:
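The quintile edges reported by qcut are what the hand-written age_bin function in the next cell encodes. A sketch of an equivalent, letting pandas assign the bin labels directly (age_edges is an illustrative name; the edge values are copied from age_bin below):

# Sketch (assumes the same quintile edges as age_bin below)
age_edges = [-np.inf, 26, 33, 41, 60, np.inf]
df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=list(range(5))).astype('category')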
In [49]:
# bin ages using the quintile edges found above
def age_bin(x):
    if x <= 26:
        return 0
    elif x <= 33:
        return 1
    elif x <= 41:
        return 2
    elif x <= 60:
        return 3
    else:
        return 4
df['Age'] = df['Age'].apply(age_bin)
df['Age'] = df['Age'].astype('category')
df.head()
Out[49]:
In [50]:
df = df[df['Education Num'] <= 16]
df['Education Num'].hist(bins = 10)
Out[50]:
In [51]:
edu_list = df['Education Num'].tolist()
edu_cats = pd.qcut(edu_list, 4)
edu_cats
Out[51]:
In [52]:
# bin education years using the quartile edges found above
def edu_bin(x):
    if x <= 9:
        return 0
    elif x <= 10:
        return 1
    elif x <= 12:
        return 2
    else:
        return 3
df['Education Num'] = df['Education Num'].apply(edu_bin)
df['Education Num'] = df['Education Num'].astype('category')
df.head()
Out[52]:
In [53]:
df.drop(['Capital Gain', 'Capital Loss'], axis = 1, inplace = True)
df.head()
Out[53]:
In [54]:
from scipy.stats import norm, entropy
s = df['APPROVE/NOT'].value_counts()
approve_rate = s / float(sum(s))
hy = entropy(approve_rate)
print(hy)
In [55]:
def HY_X(py_x, px):
    # conditional entropy H(Y|X) = sum_x p(x) * H(Y | X = x) for a binary target;
    # py_x and px must be aligned (same category order)
    ret = 0.0
    for p_x, p_y_x in zip(px, py_x):
        ret += p_x * entropy([p_y_x, 1 - p_y_x])
    return ret
def NE(hx, hy, hy_x):
    # normalized information gain: (H(Y) - H(Y|X)) / (H(X) + H(Y))
    return (hy - hy_x) / (hx + hy)
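A quick sanity check of these definitions on made-up numbers: both toy features split the data 50/50, but only the second one shifts the approval rate, so only the second gets a positive normalized score.

# Toy check (illustrative numbers only)
px_toy = [0.5, 0.5]
hy_toy = entropy([0.5, 0.5])
print(NE(entropy(px_toy), hy_toy, HY_X([0.5, 0.5], px_toy)))  # ~0: uninformative feature
print(NE(entropy(px_toy), hy_toy, HY_X([0.9, 0.1], px_toy)))  # > 0: informative feature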
In [56]:
cols = list(df.columns.values)
cols.remove("APPROVE/NOT")
cols.remove("FnlWgt")
cols.remove("hours per wk")
print(cols)
for column_name in cols:
    approved = df['APPROVE/NOT'].groupby(df[column_name]).sum()
    total = df['APPROVE/NOT'].groupby(df[column_name]).count()
    px = df[column_name].value_counts() / float(df[column_name].count())
    # align the per-category approval rate with px's category order before computing H(Y|X)
    py_x = approved.div(total).reindex(px.index)
    hx = entropy(px)
    hy_x = HY_X(py_x, px)
    ne = NE(hx, hy, hy_x)
    print("%-15s %.5f %.5f %.5f" % (column_name, hx, hy_x, ne))
In [57]:
df.drop(['Work Class'], axis = 1, inplace = True)
In [58]:
df1 = df
In [59]:
cols = ['Education', 'Maried Status', 'Occupation', 'Race', 'Relationship', 'Gender', 'Native Country']
for col in cols:
    # map each level to an integer code, numbered in order of first appearance
    keys = df1[col].unique()
    mapping = dict(zip(keys, range(1, len(keys) + 1)))
    for key, value in mapping.items():
        df1[col].replace(key, value, inplace = True)
    df1[col] = df1[col].astype('category')
df1['APPROVE/NOT'] = df1['APPROVE/NOT'].astype('category')
df1.describe(include = 'all')
Out[59]:
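An alternative sketch of the same integer encoding using pandas category codes. Note this is not byte-for-byte equivalent: cat.codes numbers levels by sorted order rather than by first appearance, so the exact integer assigned to each level may differ from the loop above.

# Sketch: integer encoding via category codes (+1 to keep codes 1-based like above)
for col in cols:
    df1[col] = (df1[col].astype('category').cat.codes + 1).astype('category')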
In [60]:
df1.to_csv('full_data1.csv', index = False, header = False,
columns = ['Age', 'Education', 'Education Num', 'Maried Status',
'Occupation', 'Relationship', 'Race', 'Gender', 'Native Country',
'FnlWgt', 'hours per wk', 'APPROVE/NOT'])
In [61]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from matplotlib import pylab
from collections import defaultdict
label_list = ['not approve', 'approve']
def train_model(clf_factory, X, Y, cv, name, isplot = False):
    labels = np.unique(Y).astype('int')
    print(labels)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []  # just to later get the median
    cms = []
    weights = [0] * X.shape[1]
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        weights = [a + b for (a, b) in zip(weights, clf.coef_[0])]
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        for label in labels:
            # one-vs-rest PR and ROC curves per class
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    avg_weights = [weight / float(cv.get_n_splits()) for weight in weights]
    print(avg_weights)
    if isplot:
        for label in labels:
            print("Plotting", label_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, label_list[label])
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % label_list[label])
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def plot_confusion_matrix(cm, label_list, name, title):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.matshow(cm, fignum = False, cmap = 'Blues', vmin = 0, vmax = 1.0)
    ax = pylab.axes()
    ax.set_xticks(range(len(label_list)))
    ax.set_xticklabels(label_list)
    ax.xaxis.set_ticks_position("bottom")
    ax.set_yticks(range(len(label_list)))
    ax.set_yticklabels(label_list)
    pylab.title(title)
    pylab.colorbar()
    pylab.grid(False)
    pylab.xlabel('Predicted Class')
    pylab.ylabel('True Class')
    pylab.show()
def plot_roc(auc_score, name, tpr, fpr, label=None):
    # pylab.clf()
    pylab.figure(num = None, figsize = (5, 4))
    pylab.grid(True)
    pylab.plot([0, 1], [0, 1], 'k--')
    pylab.plot(fpr, tpr, label=label)
    pylab.fill_between(fpr, tpr, alpha=0.5)
    pylab.xlim([0.0, 1.0])
    pylab.ylim([0.0, 1.0])
    pylab.xlabel('False Positive Rate')
    pylab.ylabel('True Positive Rate')
    pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
    pylab.legend(loc="lower right")
In [62]:
f = open("full_data1.csv")
data = np.loadtxt(f, delimiter = ',')
# data
X1 = data[:, 0:9]
X2 = data[:, 9:11]
Y = data[:, -1]
# one-hot encoder
enc = preprocessing.OneHotEncoder()
enc.fit(X1)
TX1 = enc.transform(X1).toarray()
print(TX1.shape[1])
# normalization
# zscore_scaler = preprocessing.StandardScaler()
# TX2 = zscore_scaler.fit_transform(X2)
normalizer = preprocessing.Normalizer().fit(X2)
TX2 = normalizer.transform(X2)
print(TX2.shape[1])
# combine together
X = np.concatenate((TX1, TX2), axis = 1)
# cross validation
title = "Learning Curves (Logistic Regression)"
cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)
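One thing worth flagging: preprocessing.Normalizer rescales each row of X2 to unit norm, mixing FnlWgt and hours-per-week within a sample, whereas the commented-out StandardScaler lines above standardize each column independently. If per-feature scaling is what is intended, the column-wise variant is simply:

# Sketch: column-wise scaling instead of the row-wise Normalizer above
zscore_scaler = preprocessing.StandardScaler()
TX2 = zscore_scaler.fit_transform(X2)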
In [63]:
def create_model():
    # LogisticRegression is imported above; liblinear supports the L1 penalty
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    return clf
train_avg, test_avg, cms = train_model(create_model, X, Y, cv, "Log Reg", isplot = True)
cm_avg = np.mean(cms, axis=0)
cm_norm = cm_avg / np.sum(cm_avg, axis=0)
print(cm_norm)
plot_confusion_matrix(cm_norm, label_list, "lr", "Confusion Matrix")
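A note on the normalization above: dividing by np.sum(cm_avg, axis=0) makes each column (predicted class) sum to 1, so the diagonal reads like precision. If per-true-class rates (recall) are wanted instead, a row-wise sketch:

# Sketch: row-wise normalization so each true class sums to 1
cm_row_norm = cm_avg / cm_avg.sum(axis=1, keepdims=True)
print(cm_row_norm)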
In [64]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv = cv, n_jobs = n_jobs,
        train_sizes = train_sizes, scoring = 'roc_auc')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [65]:
estimator = LogisticRegression()
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)
estimator = LogisticRegression(C = 10)
plot_learning_curve(estimator, title, X, Y, ylim = (0.7, 1.01), cv = cv, n_jobs = 4)
Out[65]: