Predicting Malignant Tumors

Wisconsin Diagnostic Breast Cancer Dataset

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

Dataset attributes:

 0. diagnosis (malignant or benign)

 1. radius (mean of distances from center to points on the perimeter)
 2. texture (standard deviation of gray-scale values)
 3. perimeter
 4. area
 5. smoothness (local variation in radius lengths)
 6. compactness (perimeter^2 / area - 1.0)
 7. concavity (severity of concave portions of the contour)
 8. concave points (number of concave portions of the contour)
 9. symmetry 
 10. fractal dimension ("coastline approximation" - 1)

In [1]:
%matplotlib inline
from sklearn.decomposition import PCA
import sys
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import seaborn as sns
sns.set_context('talk')

In [2]:
# import model selection tools and classifiers

from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [3]:
# load dataset

dfall = pd.read_csv('wdbc.data.txt')
# drop standard error and largest value for each attribute
df = dfall.drop(dfall.columns[[0,3,4,6,7,9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31]],axis=1)
# name columns
df.columns = ['diagnosis','radius','texture','perimeter','area','smoothness','compactness','concavity','concave points','symmetry','fractal dimension']
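
Note: the raw UCI wdbc.data file has no header row, and its 32 fields are ordered ID, diagnosis, ten mean features, ten standard errors, ten "worst" values. If that is the file being read here, a minimal alternative sketch that names the fields explicitly and keeps only the diagnosis plus the ten mean-value columns (column names are my own, not from the notebook):

feature_names = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave points', 'symmetry',
                 'fractal dimension']
all_cols = ['id', 'diagnosis'] + ['%s %s' % (f, s) for s in ('mean', 'se', 'worst')
                                  for f in feature_names]
raw = pd.read_csv('wdbc.data.txt', header=None, names=all_cols)
df_means = raw[['diagnosis'] + [f + ' mean' for f in feature_names]]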

In [4]:
print(df.shape)
print(df.describe())


(568, 11)
           radius      texture   perimeter        area  smoothness  \
count  568.000000   568.000000  568.000000  568.000000  568.000000   
mean    14.120491   654.279754    0.088427    0.062770    2.855984   
std      3.523416   351.923751    0.079294    0.007035    2.009288   
min      6.981000   143.500000    0.000000    0.049960    0.757000   
25%     11.697500   420.175000    0.029540    0.057697    1.605000   
50%     13.355000   548.750000    0.061400    0.061515    2.285500   
75%     15.780000   782.625000    0.129650    0.066120    3.336750   
max     28.110000  2501.000000    0.426800    0.097440   21.980000   

      compactness   concavity  concave points    symmetry  fractal dimension  
count  568.000000  568.000000      568.000000  568.000000         568.000000  
mean     0.025437    0.020526       25.691919    0.132316           0.114341  
std      0.017897    0.008264        6.141662    0.022818           0.065484  
min      0.002252    0.007882       12.020000    0.071170           0.000000  
25%      0.013048    0.015128       21.095000    0.116600           0.064730  
50%      0.020435    0.018725       25.425000    0.131300           0.099840  
75%      0.032217    0.023397       29.757500    0.146000           0.161325  
max      0.135400    0.078950       49.540000    0.222600           0.291000  

In [5]:
print(df.groupby('diagnosis').size())
df.head()


diagnosis
B    357
M    211
dtype: int64
Out[5]:
diagnosis radius texture perimeter area smoothness compactness concavity concave points symmetry fractal dimension
0 M 20.57 1326.0 0.0869 0.05667 3.398 0.01308 0.01389 23.41 0.1238 0.1860
1 M 19.69 1203.0 0.1974 0.05999 4.585 0.04006 0.02250 25.53 0.1444 0.2430
2 M 11.42 386.1 0.2414 0.09744 3.445 0.07458 0.05963 26.50 0.2098 0.2575
3 M 20.29 1297.0 0.1980 0.05883 5.438 0.02461 0.01756 16.67 0.1374 0.1625
4 M 12.45 477.1 0.1578 0.07613 2.217 0.03345 0.02165 23.75 0.1791 0.1741
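
Before modeling it is worth confirming that nothing is missing and that the feature columns are numeric; a quick check (a sketch, not part of the original run):

print(df.isnull().sum())   # count of missing values per column (expected: all zeros)
print(df.dtypes)           # diagnosis should be object, the rest numeric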

In [6]:
scatter_matrix(df)
plt.show()



In [7]:
X = df.iloc[:,1:11]
X.tail()


Out[7]:
radius texture perimeter area smoothness compactness concavity concave points symmetry fractal dimension
563 21.56 1479.0 0.24390 0.05623 7.673 0.02891 0.01114 26.40 0.14100 0.2216
564 20.13 1261.0 0.14400 0.05533 5.203 0.02423 0.01898 38.25 0.11660 0.1628
565 16.60 858.1 0.09251 0.05648 3.425 0.03731 0.01318 34.12 0.11390 0.1418
566 20.60 1265.0 0.35140 0.07016 5.772 0.06158 0.02324 39.42 0.16500 0.2650
567 7.76 181.0 0.00000 0.05884 2.548 0.00466 0.02676 30.37 0.08996 0.0000

In [8]:
# plot histogram distribution of each attribute
plt.figure(figsize=(16,6))
k = 1
for c in X.columns:
    plt.subplot(2,5,k)
    plt.hist(X[c], density=True, alpha=0.6, bins=20)
    plt.title(c)
    k += 1
plt.tight_layout()


Scaling and Centering


In [9]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

lbls = X.columns

plt.figure(figsize=(16,6))
k = 0
for c in lbls:
    plt.subplot(2,5,k+1)
    plt.hist(X_std[:,k], density=True, alpha=0.6, bins=20)
    plt.title(c)
    k += 1
plt.tight_layout()
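
StandardScaler transforms each feature to zero mean and unit variance, z = (x - mean) / std, computed column by column. A quick verification sketch (not part of the original run):

# every standardized column should have mean ~0 and standard deviation ~1
print(np.round(X_std.mean(axis=0), 6))
print(np.round(X_std.std(axis=0), 6))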


PCA Analysis


In [10]:
pca = PCA(n_components=3)
Y = pca.fit_transform(X_std)

In [11]:
w = pca.components_
v = pca.explained_variance_ratio_
print(v)

for k in range(0,len(w)):
    plt.subplot(3,1,k+1)
    plt.bar(range(0,len(w[k])),w[k],width=.5)
    plt.xticks(range(0,len(w[k])),lbls)
    plt.title('explained variance ratio = {0:.3f}'.format(v[k]))

plt.tight_layout()


[ 0.44692964  0.21996534  0.12748014]
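
The first three components explain roughly 45%, 22%, and 13% of the variance, about 79% in total. A sketch for checking how many components would be needed to reach, say, 95% (not something the notebook computes):

pca_full = PCA().fit(X_std)                        # keep all components
cum = np.cumsum(pca_full.explained_variance_ratio_)
print(cum)                                         # cumulative explained variance
print('components for 95%:', np.argmax(cum >= 0.95) + 1)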

In [12]:
# scatter the first two principal components, colored by diagnosis
k = 0
for n in df['diagnosis']:
    if n == 'M':
        plt.scatter(Y[k,0], Y[k,1], color='red', alpha=0.4)    # malignant
    else:
        plt.scatter(Y[k,0], Y[k,1], color='green', alpha=0.4)  # benign
    k += 1
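
The per-point loop works but is slow on larger data; an equivalent sketch using boolean masks, two scatter calls, and a legend (my own variation, not the original cell):

is_m = (df['diagnosis'] == 'M').values
plt.scatter(Y[is_m, 0], Y[is_m, 1], color='red', alpha=0.4, label='malignant')
plt.scatter(Y[~is_m, 0], Y[~is_m, 1], color='green', alpha=0.4, label='benign')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()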


Predictive Analysis

Train several predictive models to identify malignant tumors, compare their cross-validated accuracy, and test the most accurate model on a held-out validation set.


In [13]:
# split out validation dataset
dfvals = df.drop(df.columns[[0]], axis=1)   # drop the diagnosis column, keep the features
X = dfvals.values[:, 0:9]                    # first nine feature columns
Y = df.values[:, 0]                          # diagnosis labels ('M'/'B')

validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
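
train_test_split does not stratify by default, so the malignant/benign ratio can drift between the two splits; passing stratify=Y would fix the proportions, though the results below come from the unstratified split. A quick check of the class balance in each split (a sketch, not part of the original run):

for name, labels in (('train', Y_train), ('validation', Y_validation)):
    classes, counts = np.unique(labels, return_counts=True)
    print(name, dict(zip(classes, counts)))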

In [14]:
#check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

#evaluate each model
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)  # random_state only applies with shuffle=True, so it is omitted
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


LR: 0.929469 (0.040332)
LDA: 0.964589 (0.031734)
KNN: 0.898406 (0.069139)
CART: 0.916280 (0.027693)
NB: 0.922754 (0.028760)
SVM: 0.702560 (0.063240)
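
The SVM score stands out as poor: the default RBF kernel is sensitive to feature scale, and these features span very different ranges (see the describe() output above). A sketch of repeating the comparison with each model wrapped in a scaling pipeline (my own suggestion, not something the notebook runs):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

for name, model in models:
    pipe = make_pipeline(StandardScaler(), model)   # scaling is fit inside each CV fold
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(pipe, X_train, Y_train, cv=kfold, scoring='accuracy')
    print("%s (scaled): %f (%f)" % (name, cv_results.mean(), cv_results.std()))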

In [15]:
# compare algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()


LDA is the most accurate, so use the LDA model to evaluate the validation dataset.


In [16]:
# make predictions on validation dataset
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
predictions = lda.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))


0.956140350877
[[77  1]
 [ 4 32]]
             precision    recall  f1-score   support

          B       0.95      0.99      0.97        78
          M       0.97      0.89      0.93        36

avg / total       0.96      0.96      0.96       114
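
For this problem a false negative (a malignant tumor predicted benign) is the costliest error, and the confusion matrix above shows 4 of them. A small sketch of pulling sensitivity and specificity out of the matrix, assuming the label order ('B', 'M') used in the report:

tn, fp, fn, tp = confusion_matrix(Y_validation, predictions, labels=['B', 'M']).ravel()
print('sensitivity (malignant recall):', tp / (tp + fn))
print('specificity (benign recall):   ', tn / (tn + fp))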


In [17]:
# for comparison, make predictions with logistic regression on the validation dataset
lr = LogisticRegression()
lr.fit(X_train, Y_train)
predictions = lr.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))


0.912280701754
[[75  3]
 [ 7 29]]
             precision    recall  f1-score   support

          B       0.91      0.96      0.94        78
          M       0.91      0.81      0.85        36

avg / total       0.91      0.91      0.91       114

