Data exploration

To start, let's load the dataframe, summarize the columns, and plot a scatter matrix of the data to check for issues such as missing values or features that need a non-linear rescaling.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
# Column descriptions, from the UCI documentation:
#  1. Sample code number: id number
#  2. Clump Thickness: 1 - 10
#  3. Uniformity of Cell Size: 1 - 10
#  4. Uniformity of Cell Shape: 1 - 10
#  5. Marginal Adhesion: 1 - 10
#  6. Single Epithelial Cell Size: 1 - 10
#  7. Bare Nuclei: 1 - 10
#  8. Bland Chromatin: 1 - 10
#  9. Normal Nucleoli: 1 - 10
# 10. Mitoses: 1 - 10
# 11. Class: (2 for benign, 4 for malignant)

names = ['sampleid', 'clumpthickness', 'sizeuniformity', 'shapeunformity', 
         'adhesion', 'epithelialsize', 'barenuclei', 'blandchromatin', 'normalnucleoli', 
         'mitoses', 'cellclass'] 

df = pd.read_csv('./breast-cancer-wisconsin.data', names=names)
# Drop the id column -- it carries no predictive information.
df.drop('sampleid', axis=1, inplace=True)
df.head(10)

# Recode the class labels from {2, 4} to {0, 1}, with 1 = malignant.
df.cellclass = (df.cellclass == 4).astype(int)

# It turns out the 'barenuclei' column is parsed as strings, but should be ints...
df.barenuclei = df.barenuclei.values.astype(int)

In [33]:
df.describe()


Out[33]:
clumpthickness sizeuniformity shapeunformity adhesion epithelialsize barenuclei blandchromatin normalnucleoli mitoses cellclass
count 699.000000 699.000000 699.000000 699.000000 699.000000 6.990000e+02 699.000000 699.000000 699.000000 699.000000
mean 4.417740 3.134478 3.207439 2.806867 3.216023 2.005957e+11 3.437768 2.866953 1.589413 0.344778
std 2.815741 3.051459 2.971913 2.855379 2.214300 5.303471e+12 2.438364 3.053634 1.715078 0.475636
min 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000e+00 1.000000 1.000000 1.000000 0.000000
25% 2.000000 1.000000 1.000000 1.000000 2.000000 1.000000e+00 2.000000 1.000000 1.000000 0.000000
50% 4.000000 1.000000 1.000000 1.000000 2.000000 1.000000e+00 3.000000 1.000000 1.000000 0.000000
75% 6.000000 5.000000 5.000000 4.000000 4.000000 5.500000e+00 5.000000 4.000000 1.000000 1.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000 1.402164e+14 10.000000 10.000000 10.000000 1.000000
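
The barenuclei statistics look implausible (a mean of ~2e11 on a 1-10 scale), which suggests the raw strings were not converted cleanly. The UCI file encodes missing bare-nuclei entries as '?', and casting such strings straight to int is fragile. A more defensive conversion, as a minimal sketch (pd.to_numeric coerces non-numeric entries to NaN, which we then drop):


In [ ]:
# Hedged alternative to the raw astype(int) cast above: coerce the '?'
# placeholders to NaN, then drop the handful of affected rows.
df2 = pd.read_csv('./breast-cancer-wisconsin.data', names=names)
df2.drop('sampleid', axis=1, inplace=True)
df2['barenuclei'] = pd.to_numeric(df2['barenuclei'], errors='coerce')  # '?' -> NaN
df2 = df2.dropna(subset=['barenuclei'])
df2['barenuclei'] = df2['barenuclei'].astype(int)
df2['cellclass'] = (df2['cellclass'] == 4).astype(int)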

In [34]:
# Check the class balance (cellclass was recoded to 0/1 above).  It turns out
# to be reasonably balanced, so plain accuracy gives a relatively unbiased view.
print 'Num Benign', (df.cellclass==0).sum(), 'Num Malignant', (df.cellclass==1).sum()


Num Benign 458 Num Malignant 241
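
For reference, with 458 benign and 241 malignant samples, a majority-class baseline (always predicting benign) already scores about 65.5% accuracy, which is worth keeping in mind when reading the model accuracies below. A quick check:


In [ ]:
# Majority-class baseline accuracy: a useful classifier must beat this.
baseline = max((df.cellclass == 0).mean(), (df.cellclass == 1).mean())
print('Baseline accuracy: %.3f' % baseline)
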

Scatter matrix.

None of the features appears to require a rescaling transformation (e.g. onto a log scale).


In [35]:
# Note: in newer pandas, this has moved to pandas.plotting.scatter_matrix.
from pandas.tools.plotting import scatter_matrix
_ = scatter_matrix(df, figsize=(14,14), alpha=.4)



Constructing a logistic regression classifier

We now fit an L1-regularised logistic regression and compare it against a linear SVM, cross-validating both over the regularisation strength C.
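
With sklearn's liblinear parameterisation, the L1-penalised logistic regression solves

$$\min_{w,\,b}\ \|w\|_1 + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (w^\top x_i + b)}\right),$$

so larger C means weaker regularisation (hence the C = 1/$\lambda$ axis label below).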


In [148]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn import svm

LR = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, 
                   fit_intercept=True, intercept_scaling=1, 
                   class_weight=None, random_state=None, 
                   solver='liblinear', max_iter=100, 
                   multi_class='ovr', verbose=1, 
                   warm_start=False, n_jobs=1)

X, Y = df.astype(np.float32).get_values()[:,:-1], df.get_values()[:,-1]

# Quadratic feature expansion, kept for reference (the fits below use the raw X).
X2 = np.append(X, X**2, axis=1)
print X2.shape

LR.fit(X, Y)
print LR.score(X,Y)

C_list = np.logspace(-1, 2, 15)
CV_scores = []
CV_scores2 = [] 
for c in C_list: 
    LR = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=c, 
                   fit_intercept=True, intercept_scaling=1, 
                   class_weight=None, random_state=None, 
                   solver='liblinear', max_iter=100, 
                   multi_class='ovr', verbose=1, 
                   warm_start=False, n_jobs=1)
    CV_scores.append(np.average(cross_validation.cross_val_score(LR, X, Y, cv=6, n_jobs=12)))
    
    svm_class = svm.SVC(C=c, kernel='linear', gamma='auto', coef0=0.0, 
        shrinking=True, probability=False, tol=0.001, cache_size=200, 
        class_weight=None, verbose=False, 
        max_iter=-1, decision_function_shape=None, random_state=None)
    CV_scores2.append(np.average(cross_validation.cross_val_score(svm_class, X, Y, cv=6, n_jobs=12)))


(699, 18)
[LibLinear]0.961373390558
[LibLinear][LibLinear][LibLinear]...

In [147]:
plt.plot(C_list, CV_scores, marker='o', label='Logistic Regression (L1 penalty)')
plt.plot(C_list, CV_scores2, marker='o', label='SVM-Linear')
plt.xscale('log')
plt.xlabel(r'C = $1/\lambda$')
plt.ylabel('Mean 6-fold CV accuracy')
plt.legend(loc=4)


Out[147]:
<matplotlib.legend.Legend at 0x7f8685322350>
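
The sweep above makes picking an operating point easy: the best C for the logistic regression is just the argmax of its CV curve. A small sketch, using C_list and CV_scores from the loop above:


In [ ]:
# Pick the C that maximised the mean 6-fold CV accuracy.
best_idx = int(np.argmax(CV_scores))
print('Best C = %.3f with CV accuracy = %.4f' % (C_list[best_idx], CV_scores[best_idx]))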

In [128]:
from sklearn.metrics import confusion_matrix

# Essentially unregularised L1 logistic regression (very large C), trained on a
# simple holdout split: first 300 rows for training, the remainder for testing.
LR = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1e10, 
               fit_intercept=True, intercept_scaling=1, 
               class_weight=None, random_state=None, 
               solver='liblinear', max_iter=100, 
               multi_class='ovr', verbose=1, 
               warm_start=False, n_jobs=1)
LR.fit(X[:300],Y[:300])

svm_class = svm.SVC(C=10., kernel='linear', gamma='auto', coef0=0.0, 
        shrinking=True, probability=True, tol=0.001, cache_size=200, 
        class_weight=None, verbose=False, 
        max_iter=-1, decision_function_shape=None, random_state=None)
svm_class.fit(X[:300],Y[:300])

# Confusion matrix
print 
print 'Confusion Matrix - L1 Logistic Regression'
print confusion_matrix(y_true=Y[300:], y_pred=LR.predict(X[300:]))
print 'Confusion Matrix - SVM-Linear'
print confusion_matrix(y_true=Y[300:], y_pred=svm_class.predict(X[300:]))


[LibLinear]
Confusion Matrix - L1 Logistic Regression
[[288   7]
 [  4 100]]
Confusion Matrix - SVM-Linear
[[290   5]
 [  4 100]]
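
With sklearn's [[TN, FP], [FN, TP]] layout, precision and recall follow directly from these matrices: for the logistic regression, precision = 100/(100+7) ≈ 0.93 and recall = 100/(100+4) ≈ 0.96 on the held-out rows. A quick sketch computing the same numbers from the fitted models:


In [ ]:
from sklearn.metrics import precision_score, recall_score

# Held-out precision/recall for both classifiers (test rows 300 onward).
for name, clf in [('LogReg-L1', LR), ('SVM-Linear', svm_class)]:
    y_pred = clf.predict(X[300:])
    print('%s: precision=%.3f recall=%.3f' % (
        name, precision_score(Y[300:], y_pred), recall_score(Y[300:], y_pred)))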

Measuring precision/recall and ROC curves


In [140]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve


plt.figure(figsize=(7,2))

plt.subplot(121)
# Compute the held-out probabilities once and reuse them below.
probs = LR.predict_proba(X[300:])[:,1]
prec, rec, thresh = precision_recall_curve(y_true=Y[300:], probas_pred=probs)
plt.plot(rec, prec)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim(0,1)
plt.ylim(0,1)

plt.subplot(122)
# Use a distinct name for the ROC thresholds so the PR 'thresh' is not clobbered.
fp, tp, roc_thresh = roc_curve(y_true=Y[300:], y_score=probs)
AUC = roc_auc_score(y_true=Y[300:], y_score=probs)
plt.text(.05, .05, 'AUC=%1.3f'%AUC)
plt.plot(fp, tp, linewidth=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')


Out[140]:
<matplotlib.text.Text at 0x7f8684ff5910>
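
The thresholds returned by precision_recall_curve also let us choose an operating point programmatically, e.g. the most precise point among those meeting a recall target. A minimal sketch (the 0.95 recall target is an arbitrary illustration; prec, rec and thresh come from the cell above):


In [ ]:
# precision_recall_curve aligns thresh with prec[:-1] and rec[:-1].
# Among points with recall >= target, pick the one with the best precision.
target_recall = 0.95
ok = rec[:-1] >= target_recall      # assumes the target is achievable
best = np.argmax(prec[:-1] * ok)    # masked argmax over qualifying points
print('threshold=%.3f precision=%.3f recall=%.3f' % (thresh[best], prec[best], rec[best]))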
