In [2]:
import pandas as pd
import numpy as np

In [3]:
# Column labels passed to read_csv via `names=`: index 0 is the sample id,
# indices 1-9 are the feature columns, and index 10 is the class label.
column_names = [
    'sample code number',
    'clump thickness',
    'uniformity of cell size',
    'uniformity of cell shape',
    'marginal adhesion',
    'single epithelial cell size',
    'bare nuclei',
    'bland chromatin',
    'normal nucleoli',
    'mitoses',
    'class',
]

# Load the Wisconsin breast-cancer table straight from the UCI repository,
# labelling its columns with the names listed above.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

In [4]:
# The raw file marks missing entries with '?'; turn them into NaN so pandas
# can recognise and drop them.
data = data.replace(to_replace='?', value=np.nan)

# Discard every row that has at least one missing value.
data = data.dropna(how='any')

# A column that contained '?' was parsed as strings and is still string-typed
# after the rows are dropped. Cast every column to a numeric dtype so later
# steps do not depend on implicit string-to-float coercion.
# NOTE(review): assumes all remaining values are numeric strings or numbers;
# pd.to_numeric raises if that is not the case.
data = data.apply(pd.to_numeric)

# Show (rows, columns) of the cleaned table.
data.shape


Out[4]:
(683, 11)

In [10]:
# help(pd)
# pd.__file__
# import sklearn
# sklearn.__file__
# help(sklearn)

In [5]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer `sklearn.model_selection` and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Features are columns 1-9 of `column_names` (the sample id in column 0 is
# left out); the target is column 10, 'class'.
# 25% of the rows are held out for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33
)


/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [6]:
# Class balance of the training labels: number of rows per value of 'class'.
y_train.value_counts()


Out[6]:
2    344
4    168
Name: class, dtype: int64

In [7]:
# Class balance of the held-out test labels: number of rows per value of 'class'.
y_test.value_counts()


Out[7]:
2    100
4     71
Name: class, dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [9]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused for the test split, so no information
# from the test rows leaks into the preprocessing.
ss = StandardScaler()
ss.fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

In [10]:
# Two linear classifiers trained on the same standardised features.
lr = LogisticRegression()
# SGD training is stochastic; without a fixed seed the fitted model (and the
# accuracy reported for it) changes from run to run. Reuse the seed used for
# the train/test split so a rerun reproduces the same model.
sgdc = SGDClassifier(random_state=33)

# Logistic regression: fit on the training split, predict the test split.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

# SGD classifier: same procedure.
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

In [12]:
from sklearn.metrics import classification_report

# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the original `print a, b` statement is a SyntaxError on Python 3.
# Mean accuracy of the logistic-regression model on the test split.
print('Accuracy of LR Classifier: %s' % lr.score(X_test, y_test))

# Per-class precision / recall / F1. target_names are matched to the class
# labels in sorted order, i.e. 2 -> 'Benign', 4 -> 'Malignant'.
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))


Accuracy of LR Classifier: 0.988304093567
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171


In [16]:
# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the original `print a, b` statement is a SyntaxError on Python 3.
# NOTE(review): "SGC" in the label looks like a typo for "SGD"; it is kept
# unchanged here so the text still matches the recorded output.
# Mean accuracy of the SGD classifier on the test split.
print('Accuracy of SGC Classifier: %s' % sgdc.score(X_test, y_test))

# Per-class precision / recall / F1. target_names are matched to the class
# labels in sorted order, i.e. 2 -> 'Benign', 4 -> 'Malignant'.
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))


Accuracy of SGC Classifier: 0.941520467836
             precision    recall  f1-score   support

     Benign       0.91      1.00      0.95       100
  Malignant       1.00      0.86      0.92        71

avg / total       0.95      0.94      0.94       171