In [2]:
# Homework 2
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC as svc
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV as gs
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
%matplotlib inline
In [6]:
column_names = ['A' + str(i) for i in range(1, 17)]
data = pd.read_csv('./crx.data', names=column_names)
In [7]:
data.head()
Out[7]:
In [8]:
data.describe()
Out[8]:
In [9]:
data.info()
In [10]:
# Ok. So now we've got our data, and have taken a look at it.
# I really wish this stuff wasn't so masked out; it feels like gibberish.
with open('./crx.names') as f:
    lines = f.readlines()
for line in lines:
    print line
In [11]:
# Does data.info() make the correct columns continuous?
# we need 2, 3, 8, 11, 14, 15
data.info()
An old procedure that should certainly be relegated to the past was the idea of substituting a mean for the missing data. For example, if you don't know my systolic blood pressure, just substitute the mean systolic blood pressure for mine and continue. There are a couple of problems with this approach. In the first place, it adds no new information: the overall mean, with or without replacing my missing data, will be the same. In addition, such a process leads to an underestimate of error. Cohen et al. (2003) gave an interesting example of a data set on university faculty, consisting of salary and the citation level of publications. There were 62 cases with complete data and 7 cases for which the citation index was missing. Cohen gives the following table.
 N     r      b          St. Err. (b)   Analysis
 62    .55    310.747    60.95          Complete cases
 69    .54    310.747    59.13          Mean substitution
Notice that using mean substitution makes only a trivial change in the correlation coefficient and no change in the regression coefficient. But the standard error of b is noticeably smaller under mean substitution. That should not be surprising: we have added no new information to the data, but we have increased the sample size. The effect of increasing the sample size is to increase the denominator for computing the standard error, thus reducing the standard error. Adding no new information certainly should not make you more comfortable with the result, but this would seem to. The reduction is spurious and should be avoided, as the quick sketch below illustrates.
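To convince myself of that standard-error argument, here is a quick sketch with made-up numbers (not the crx data): filling missing values with the column mean leaves the mean unchanged, but the standard error of the mean shrinks anyway, because n goes up while no information is added.
In [ ]:
# Illustrative only (fabricated numbers, not the crx data):
# 62 "observed" values plus 7 mean-substituted ones.
rng = np.random.RandomState(0)
x = rng.normal(120, 15, size=62)
filled = np.concatenate([x, [x.mean()] * 7])
print x.mean(), filled.mean()                    # identical means
print x.std(ddof=1) / np.sqrt(len(x))            # SE of the mean, complete cases only
print filled.std(ddof=1) / np.sqrt(len(filled))  # spuriously smaller after mean substitution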
So I was reading up on best methods for imputation and I came to an impasse: I didn't find a really good source on best practices. I started wondering how well I could do if I just got rid of the rows with missing values. Since we are only missing about 5% of the rows, which seems like very little to me, I've done the rest of the homework after dropping the rows with missing data.
In [12]:
data.replace('?', np.NaN, inplace=True)
data[['A2', 'A14']] = data[['A2', 'A14']].astype(float)
data.info()
what_if_i_want_to_impute_later = data.copy()
# We're assuming the data is missing at random
clean = data.dropna(axis=0)
clean.info()
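For completeness, here is one way the saved copy could be imputed later if I change my mind (just a sketch, not used anywhere below): fill the numeric columns with their median and the categorical ones with their most frequent value.
In [ ]:
# Sketch only: simple median / mode imputation on the copy saved above.
imputed = what_if_i_want_to_impute_later.copy()
for col in imputed.columns:
    if imputed[col].dtype == object:
        imputed[col] = imputed[col].fillna(imputed[col].mode()[0])
    else:
        imputed[col] = imputed[col].fillna(imputed[col].median())
imputed.info()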
In [13]:
data.describe()
Out[13]:
In [14]:
clean.describe()
Out[14]:
In [15]:
# what difference does cleaning make?
clean.describe() - data.describe()
# not that much
Out[15]:
In [16]:
# But what if it isn't random? Are the missing values correlated with something?
dirty = data[pd.isnull(data).any(axis=1)]
In [17]:
dirty.corr()
Out[17]:
In [18]:
# A11 and A15 are the only pair that look possibly correlated.
In [19]:
print stats.pearsonr(data['A11'], data['A15'])
print stats.pearsonr(clean['A11'], clean['A15'])
In [20]:
# Basically we can see here that removing the data doesn't change the correlation much or anything.
In [21]:
from pandas import scatter_matrix
scatter_matrix(clean)
Out[21]:
In [22]:
# change those plus and minus signs
clean = clean.copy()
clean['A16'] = clean['A16'].replace({'+': '1', '-': '0'})
clean['A16'].describe()
Out[22]:
In [23]:
approval = clean['A16']
data = clean.drop('A16', axis=1)
data = pd.get_dummies(data)
data.head()
Out[23]:
In [24]:
x_train, x_test, y_train, y_test = train_test_split(data, approval, test_size=0.30)
In [25]:
# Logistic regression
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm
In [26]:
# Linear SVM
est = svc()
est.fit(x_train, y_train)
d = {'C': np.logspace(-3., 3., 10)}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results
Out[26]:
In [27]:
# Non linear SVM
est = SVC()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'gamma': np.logspace(-3., 3., 10),
'kernel' : ['sigmoid', 'rbf'],
} # I actually added a few more here but some took too long so I left them out.
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
nonlinear_results = g.best_estimator_, g.best_params_, g.best_score_
In [28]:
nonlinear_results
Out[28]:
In [29]:
# Hmm. Let's try some more
est = SGD()
est.fit(x_train, y_train)
d = {
'loss': ['log', 'perceptron', 'huber', 'epsilon_insensitive'],
'penalty': ['l1', 'elasticnet', 'l2'],
'alpha' : np.logspace(-3., 3., 10),
'epsilon' : np.logspace(-3., 3., 10)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
sgd_results = g.best_estimator_, g.best_params_, g.best_score_
In [30]:
sgd_results
Out[30]:
In [31]:
print linear_results[2]
print nonlinear_results[2]
print sgd_results[2]
# It looks like the logistic regression did best actually...
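As an extra sanity check on that comparison (my own addition, not required for the assignment), here is the 5-fold cross-validation accuracy on the training split for each model with default settings.
In [ ]:
# Same metric and same folds for each untuned model.
from sklearn.cross_validation import cross_val_score
for name, model in [('logistic', LR()), ('linear SVM', svc()), ('SGD', SGD())]:
    scores = cross_val_score(model, x_train, y_train, cv=5)
    print name, scores.mean()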
In [32]:
# Linear
est = svc()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'tol': np.logspace(-3., 3., 10),
'dual': [True, False]
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results
Out[32]:
In [33]:
# Adding tol / dual made a notable difference.
# Linear
est = svc()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10),
'tol': np.logspace(-3., 3., 10),
'intercept_scaling': np.logspace(-3., 3., 10)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
linear_results = g.best_estimator_, g.best_params_, g.best_score_; linear_results
Out[33]:
In [34]:
# Ok, intercept_scaling doesn't seem to matter much.
# Our best params: 'C': 0.021544346900318832, 'dual': False, 'tol': 0.001
est = svc(C=0.021544346900318832, dual=False, tol=0.001)
est.fit(x_train, y_train)
print est.score(x_test, y_test)
y_pred = est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print cm
print classification_report(y_test, y_pred)
In [35]:
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [36]:
current_palette = sns.color_palette()
sns.palplot(current_palette)
sns.palplot(sns.color_palette("GnBu"))
In [37]:
# Pretty time !
In [38]:
plt.imshow(cm, cmap='GnBu', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm
In [39]:
# Let's use a dummy classifier just to check and see what's up.
dummy_test1 = DummyClassifier(strategy="stratified").fit(x_train, y_train).score(x_test, y_test)
dummy_test2 = DummyClassifier(strategy="most_frequent").fit(x_train, y_train).score(x_test, y_test)
dummy_test3 = DummyClassifier(strategy="uniform").fit(x_train, y_train).score(x_test, y_test)
print dummy_test1, dummy_test2, dummy_test3, est.score(x_test, y_test)
In [40]:
# Looks like we are doing alright.
In [41]:
# So overall it looks like logistic regression did the best, even without adjusting parameters.
In [42]:
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 10)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_
In [43]:
log_results
Out[43]:
In [44]:
y_pred = g.predict(x_test)  # use predictions from the tuned logistic model above
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm
In [46]:
# Logistic # NO PARAMETERS ADJUSTED
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm
In [74]:
# increase the number of C values in the grid... by a factor... of 10 !
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-3., 3., 100)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_; log_results
Out[74]:
In [75]:
# Ok: widen the range (1e-4 to 1e4) and search over both C and tol, 20 x 20 = 400 combinations.
est = LR()
est.fit(x_train, y_train)
d = {
'C': np.logspace(-4., 4., 20),
'tol': np.logspace(-4., 4., 20)
}
g = gs(est, d)
g.fit(x_train, y_train)
g.score(x_test, y_test)
log_results = g.best_estimator_, g.best_params_, g.best_score_; log_results
Out[75]:
In [51]:
# So basically you should just leave C at 1.0 evidently, haha.
In [73]:
# Logistic # NO PARAMETERS ADJUSTED
log_est = LR()
print log_est.fit(x_train, y_train).score(x_test, y_test)
# Damn, that's pretty good.
y_pred = log_est.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.imshow(cm, cmap='Blues', interpolation='nearest')
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.grid(False)
plt.xlabel('Predicted label')
plt.show()
print cm
In [ ]: