In [1]:
import pandas as pd
%matplotlib inline
In [2]:
from sklearn import datasets
from pandas.plotting import scatter_matrix
In [3]:
import matplotlib.pyplot as plt
In [4]:
iris = datasets.load_iris()
In [5]:
iris
Out[5]:
{'DESCR': 'Iris Plants Database\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThis is a copy of UCI ML iris datasets.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\nThe famous Iris database, first used by Sir R.A Fisher\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\nReferences\n----------\n - Fisher,R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...\n',
'data': array([[ 5.1, 3.5, 1.4, 0.2],
[ 4.9, 3. , 1.4, 0.2],
[ 4.7, 3.2, 1.3, 0.2],
[ 4.6, 3.1, 1.5, 0.2],
[ 5. , 3.6, 1.4, 0.2],
[ 5.4, 3.9, 1.7, 0.4],
[ 4.6, 3.4, 1.4, 0.3],
[ 5. , 3.4, 1.5, 0.2],
[ 4.4, 2.9, 1.4, 0.2],
[ 4.9, 3.1, 1.5, 0.1],
[ 5.4, 3.7, 1.5, 0.2],
[ 4.8, 3.4, 1.6, 0.2],
[ 4.8, 3. , 1.4, 0.1],
[ 4.3, 3. , 1.1, 0.1],
[ 5.8, 4. , 1.2, 0.2],
[ 5.7, 4.4, 1.5, 0.4],
[ 5.4, 3.9, 1.3, 0.4],
[ 5.1, 3.5, 1.4, 0.3],
[ 5.7, 3.8, 1.7, 0.3],
[ 5.1, 3.8, 1.5, 0.3],
[ 5.4, 3.4, 1.7, 0.2],
[ 5.1, 3.7, 1.5, 0.4],
[ 4.6, 3.6, 1. , 0.2],
[ 5.1, 3.3, 1.7, 0.5],
[ 4.8, 3.4, 1.9, 0.2],
[ 5. , 3. , 1.6, 0.2],
[ 5. , 3.4, 1.6, 0.4],
[ 5.2, 3.5, 1.5, 0.2],
[ 5.2, 3.4, 1.4, 0.2],
[ 4.7, 3.2, 1.6, 0.2],
[ 4.8, 3.1, 1.6, 0.2],
[ 5.4, 3.4, 1.5, 0.4],
[ 5.2, 4.1, 1.5, 0.1],
[ 5.5, 4.2, 1.4, 0.2],
[ 4.9, 3.1, 1.5, 0.1],
[ 5. , 3.2, 1.2, 0.2],
[ 5.5, 3.5, 1.3, 0.2],
[ 4.9, 3.1, 1.5, 0.1],
[ 4.4, 3. , 1.3, 0.2],
[ 5.1, 3.4, 1.5, 0.2],
[ 5. , 3.5, 1.3, 0.3],
[ 4.5, 2.3, 1.3, 0.3],
[ 4.4, 3.2, 1.3, 0.2],
[ 5. , 3.5, 1.6, 0.6],
[ 5.1, 3.8, 1.9, 0.4],
[ 4.8, 3. , 1.4, 0.3],
[ 5.1, 3.8, 1.6, 0.2],
[ 4.6, 3.2, 1.4, 0.2],
[ 5.3, 3.7, 1.5, 0.2],
[ 5. , 3.3, 1.4, 0.2],
[ 7. , 3.2, 4.7, 1.4],
[ 6.4, 3.2, 4.5, 1.5],
[ 6.9, 3.1, 4.9, 1.5],
[ 5.5, 2.3, 4. , 1.3],
[ 6.5, 2.8, 4.6, 1.5],
[ 5.7, 2.8, 4.5, 1.3],
[ 6.3, 3.3, 4.7, 1.6],
[ 4.9, 2.4, 3.3, 1. ],
[ 6.6, 2.9, 4.6, 1.3],
[ 5.2, 2.7, 3.9, 1.4],
[ 5. , 2. , 3.5, 1. ],
[ 5.9, 3. , 4.2, 1.5],
[ 6. , 2.2, 4. , 1. ],
[ 6.1, 2.9, 4.7, 1.4],
[ 5.6, 2.9, 3.6, 1.3],
[ 6.7, 3.1, 4.4, 1.4],
[ 5.6, 3. , 4.5, 1.5],
[ 5.8, 2.7, 4.1, 1. ],
[ 6.2, 2.2, 4.5, 1.5],
[ 5.6, 2.5, 3.9, 1.1],
[ 5.9, 3.2, 4.8, 1.8],
[ 6.1, 2.8, 4. , 1.3],
[ 6.3, 2.5, 4.9, 1.5],
[ 6.1, 2.8, 4.7, 1.2],
[ 6.4, 2.9, 4.3, 1.3],
[ 6.6, 3. , 4.4, 1.4],
[ 6.8, 2.8, 4.8, 1.4],
[ 6.7, 3. , 5. , 1.7],
[ 6. , 2.9, 4.5, 1.5],
[ 5.7, 2.6, 3.5, 1. ],
[ 5.5, 2.4, 3.8, 1.1],
[ 5.5, 2.4, 3.7, 1. ],
[ 5.8, 2.7, 3.9, 1.2],
[ 6. , 2.7, 5.1, 1.6],
[ 5.4, 3. , 4.5, 1.5],
[ 6. , 3.4, 4.5, 1.6],
[ 6.7, 3.1, 4.7, 1.5],
[ 6.3, 2.3, 4.4, 1.3],
[ 5.6, 3. , 4.1, 1.3],
[ 5.5, 2.5, 4. , 1.3],
[ 5.5, 2.6, 4.4, 1.2],
[ 6.1, 3. , 4.6, 1.4],
[ 5.8, 2.6, 4. , 1.2],
[ 5. , 2.3, 3.3, 1. ],
[ 5.6, 2.7, 4.2, 1.3],
[ 5.7, 3. , 4.2, 1.2],
[ 5.7, 2.9, 4.2, 1.3],
[ 6.2, 2.9, 4.3, 1.3],
[ 5.1, 2.5, 3. , 1.1],
[ 5.7, 2.8, 4.1, 1.3],
[ 6.3, 3.3, 6. , 2.5],
[ 5.8, 2.7, 5.1, 1.9],
[ 7.1, 3. , 5.9, 2.1],
[ 6.3, 2.9, 5.6, 1.8],
[ 6.5, 3. , 5.8, 2.2],
[ 7.6, 3. , 6.6, 2.1],
[ 4.9, 2.5, 4.5, 1.7],
[ 7.3, 2.9, 6.3, 1.8],
[ 6.7, 2.5, 5.8, 1.8],
[ 7.2, 3.6, 6.1, 2.5],
[ 6.5, 3.2, 5.1, 2. ],
[ 6.4, 2.7, 5.3, 1.9],
[ 6.8, 3. , 5.5, 2.1],
[ 5.7, 2.5, 5. , 2. ],
[ 5.8, 2.8, 5.1, 2.4],
[ 6.4, 3.2, 5.3, 2.3],
[ 6.5, 3. , 5.5, 1.8],
[ 7.7, 3.8, 6.7, 2.2],
[ 7.7, 2.6, 6.9, 2.3],
[ 6. , 2.2, 5. , 1.5],
[ 6.9, 3.2, 5.7, 2.3],
[ 5.6, 2.8, 4.9, 2. ],
[ 7.7, 2.8, 6.7, 2. ],
[ 6.3, 2.7, 4.9, 1.8],
[ 6.7, 3.3, 5.7, 2.1],
[ 7.2, 3.2, 6. , 1.8],
[ 6.2, 2.8, 4.8, 1.8],
[ 6.1, 3. , 4.9, 1.8],
[ 6.4, 2.8, 5.6, 2.1],
[ 7.2, 3. , 5.8, 1.6],
[ 7.4, 2.8, 6.1, 1.9],
[ 7.9, 3.8, 6.4, 2. ],
[ 6.4, 2.8, 5.6, 2.2],
[ 6.3, 2.8, 5.1, 1.5],
[ 6.1, 2.6, 5.6, 1.4],
[ 7.7, 3. , 6.1, 2.3],
[ 6.3, 3.4, 5.6, 2.4],
[ 6.4, 3.1, 5.5, 1.8],
[ 6. , 3. , 4.8, 1.8],
[ 6.9, 3.1, 5.4, 2.1],
[ 6.7, 3.1, 5.6, 2.4],
[ 6.9, 3.1, 5.1, 2.3],
[ 5.8, 2.7, 5.1, 1.9],
[ 6.8, 3.2, 5.9, 2.3],
[ 6.7, 3.3, 5.7, 2.5],
[ 6.7, 3. , 5.2, 2.3],
[ 6.3, 2.5, 5. , 1.9],
[ 6.5, 3. , 5.2, 2. ],
[ 6.2, 3.4, 5.4, 2.3],
[ 5.9, 3. , 5.1, 1.8]]),
'feature_names': ['sepal length (cm)',
'sepal width (cm)',
'petal length (cm)',
'petal width (cm)'],
'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
'target_names': array(['setosa', 'versicolor', 'virginica'],
dtype='<U10')}
In [6]:
x = iris.data[:, :2]  # use only the first two features: sepal length and sepal width
y = iris.target
In [7]:
from sklearn import tree
In [8]:
from sklearn.model_selection import train_test_split
In [9]:
dt = tree.DecisionTreeClassifier()
In [10]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [11]:
dt = dt.fit(x_train,y_train)
In [12]:
from sklearn import metrics
In [13]:
import numpy as np
In [14]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [15]:
measure_performance(x_train,y_train,dt)
Accuracy:0.947

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       0.87      1.00      0.93        26
          2       1.00      0.86      0.92        28

avg / total       0.95      0.95      0.95        75

Confusion matrix
[[21  0  0]
 [ 0 26  0]
 [ 0  4 24]]
In [16]:
measure_performance(x_test,y_test,dt)
Accuracy:0.707

Classification report
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        29
          1       0.55      0.46      0.50        24
          2       0.54      0.64      0.58        22

avg / total       0.71      0.71      0.70        75

Confusion matrix
[[28  1  0]
 [ 1 11 12]
 [ 0  8 14]]
Every time I run the code the results differ, i.e. the accuracy varies. Sometimes the training dataset does better, sometimes it is the other way around. This happens because train_test_split shuffles the data with a different random seed on each run, so each execution trains and evaluates on a different split. Since I did not at first understand what was going on under the hood, I could not go further into my analysis. And in general I would say that I could not trust the results of such a model: as a journalist, I am accountable for everything I produce, and I could not rely on a "source" that changes its version every single time I interview him/her.
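If reproducibility is the concern, scikit-learn lets you pin the shuffle by passing a seed. A minimal sketch, assuming the same x and y as above (the seed value 42 is an arbitrary choice):

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, train_size=0.5, random_state=42)  # fixed seed -> same split on every run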
In [17]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.75,train_size=0.25)
In [18]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [19]:
measure_performance(x_train,y_train,dt)
Accuracy:0.730

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        12
          1       0.62      0.42      0.50        12
          2       0.59      0.77      0.67        13

avg / total       0.73      0.73      0.72        37

Confusion matrix
[[12  0  0]
 [ 0  5  7]
 [ 0  3 10]]
In [20]:
measure_performance(x_test,y_test,dt)
Accuracy:0.858

Classification report
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        38
          1       0.76      0.84      0.80        38
          2       0.85      0.76      0.80        37

avg / total       0.86      0.86      0.86       113

Confusion matrix
[[37  1  0]
 [ 1 32  5]
 [ 0  9 28]]
In this case, the test dataset seems to do a little bit better than the training dataset. My first guess was that the larger the training dataset, the more accuracy the model can achieve, and that a smaller test set is easier to score well on. But note that the tree was never refit after re-splitting: dt is still the model fitted on the earlier 50/50 split, so the new 37-sample "training" set is really just a small random evaluation sample, while the new 113-sample test set necessarily overlaps with the data the tree already saw during that earlier fit. Refitting on the new training set would make the comparison meaningful.
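A minimal sketch of the refit this comparison is missing, reusing the 25/75 split from In [17] (results would of course differ from the numbers above):

dt = tree.DecisionTreeClassifier()
dt = dt.fit(x_train, y_train)  # refit on the new, smaller training set
measure_performance(x_train, y_train, dt)
measure_performance(x_test, y_test, dt)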
Load the breast cancer dataset (datasets.load_breast_cancer()) and perform basic exploratory analysis. What attributes do we have? What are we trying to predict? For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
In [21]:
breast_cancer = datasets.load_breast_cancer()
In [22]:
breast_cancer
Out[22]:
{'DESCR': 'Breast Cancer Wisconsin (Diagnostic) Database\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 569\n\n :Number of Attributes: 30 numeric, predictive attributes and the class\n\n :Attribute Information:\n - radius (mean of distances from center to points on the perimeter)\n - texture (standard deviation of gray-scale values)\n - perimeter\n - area\n - smoothness (local variation in radius lengths)\n - compactness (perimeter^2 / area - 1.0)\n - concavity (severity of concave portions of the contour)\n - concave points (number of concave portions of the contour)\n - symmetry \n - fractal dimension ("coastline approximation" - 1)\n \n The mean, standard error, and "worst" or largest (mean of the three\n largest values) of these features were computed for each image,\n resulting in 30 features. For instance, field 3 is Mean Radius, field\n 13 is Radius SE, field 23 is Worst Radius.\n \n - class:\n - WDBC-Malignant\n - WDBC-Benign\n\n :Summary Statistics:\n\n ===================================== ======= ========\n Min Max\n ===================================== ======= ========\n radius (mean): 6.981 28.11\n texture (mean): 9.71 39.28\n perimeter (mean): 43.79 188.5\n area (mean): 143.5 2501.0\n smoothness (mean): 0.053 0.163\n compactness (mean): 0.019 0.345\n concavity (mean): 0.0 0.427\n concave points (mean): 0.0 0.201\n symmetry (mean): 0.106 0.304\n fractal dimension (mean): 0.05 0.097\n radius (standard error): 0.112 2.873\n texture (standard error): 0.36 4.885\n perimeter (standard error): 0.757 21.98\n area (standard error): 6.802 542.2\n smoothness (standard error): 0.002 0.031\n compactness (standard error): 0.002 0.135\n concavity (standard error): 0.0 0.396\n concave points (standard error): 0.0 0.053\n symmetry (standard error): 0.008 0.079\n fractal dimension (standard error): 0.001 0.03\n radius (worst): 7.93 36.04\n texture (worst): 12.02 49.54\n perimeter (worst): 50.41 251.2\n area (worst): 185.2 4254.0\n smoothness (worst): 0.071 0.223\n compactness (worst): 0.027 1.058\n concavity (worst): 0.0 1.252\n concave points (worst): 0.0 0.291\n symmetry (worst): 0.156 0.664\n fractal dimension (worst): 0.055 0.208\n ===================================== ======= ========\n\n :Missing Attribute Values: None\n\n :Class Distribution: 212 - Malignant, 357 - Benign\n\n :Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\n\n :Donor: Nick Street\n\n :Date: November, 1995\n\nThis is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\nhttps://goo.gl/U2Uwz2\n\nFeatures are computed from a digitized image of a fine needle\naspirate (FNA) of a breast mass. They describe\ncharacteristics of the cell nuclei present in the image.\nA few of the images can be found at\nhttp://www.cs.wisc.edu/~street/images/\n\nSeparating plane described above was obtained using\nMultisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree\nConstruction Via Linear Programming." Proceedings of the 4th\nMidwest Artificial Intelligence and Cognitive Science Society,\npp. 97-101, 1992], a classification method which uses linear\nprogramming to construct a decision tree. Relevant features\nwere selected using an exhaustive search in the space of 1-4\nfeatures and 1-3 separating planes.\n\nThe actual linear program used to obtain the separating plane\nin the 3-dimensional space is that described in:\n[K. P. Bennett and O. L. 
Mangasarian: "Robust Linear\nProgramming Discrimination of Two Linearly Inseparable Sets",\nOptimization Methods and Software 1, 1992, 23-34].\n\nThis database is also available through the UW CS ftp server:\n\nftp ftp.cs.wisc.edu\ncd math-prog/cpo-dataset/machine-learn/WDBC/\n\nReferences\n----------\n - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n Electronic Imaging: Science and Technology, volume 1905, pages 861-870, \n San Jose, CA, 1993. \n - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and \n prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n July-August 1995.\n - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\n to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n 163-171.\n',
'data': array([[ 1.79900000e+01, 1.03800000e+01, 1.22800000e+02, ...,
2.65400000e-01, 4.60100000e-01, 1.18900000e-01],
[ 2.05700000e+01, 1.77700000e+01, 1.32900000e+02, ...,
1.86000000e-01, 2.75000000e-01, 8.90200000e-02],
[ 1.96900000e+01, 2.12500000e+01, 1.30000000e+02, ...,
2.43000000e-01, 3.61300000e-01, 8.75800000e-02],
...,
[ 1.66000000e+01, 2.80800000e+01, 1.08300000e+02, ...,
1.41800000e-01, 2.21800000e-01, 7.82000000e-02],
[ 2.06000000e+01, 2.93300000e+01, 1.40100000e+02, ...,
2.65000000e-01, 4.08700000e-01, 1.24000000e-01],
[ 7.76000000e+00, 2.45400000e+01, 4.79200000e+01, ...,
0.00000000e+00, 2.87100000e-01, 7.03900000e-02]]),
'feature_names': array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension'],
dtype='<U23'),
'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]),
'target_names': array(['malignant', 'benign'],
dtype='<U9')}
In [23]:
breast_cancer.keys()
Out[23]:
dict_keys(['feature_names', 'target_names', 'target', 'DESCR', 'data'])
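The target is binary: 0 and 1 map to target_names ['malignant', 'benign']. A quick way to check the class balance, using the numpy import from earlier (the counts match the class distribution stated in the DESCR above):

np.bincount(breast_cancer.target)  # array([212, 357]): 212 malignant, 357 benign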
In [24]:
breast_cancer['feature_names']
Out[24]:
array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension'],
dtype='<U23')
In [25]:
type(breast_cancer)
Out[25]:
sklearn.datasets.base.Bunch
In [26]:
df = pd.DataFrame(breast_cancer.data, columns=breast_cancer['feature_names'])
In [27]:
df
Out[27]:
     mean radius  mean texture  mean perimeter  mean area    ...    worst concave points  worst symmetry  worst fractal dimension
0         17.990         10.38          122.80     1001.0    ...                  0.2654          0.4601                  0.11890
1         20.570         17.77          132.90     1326.0    ...                  0.1860          0.2750                  0.08902
2         19.690         21.25          130.00     1203.0    ...                  0.2430          0.3613                  0.08758
3         11.420         20.38           77.58      386.1    ...                  0.2575          0.6638                  0.17300
4         20.290         14.34          135.10     1297.0    ...                  0.1625          0.2364                  0.07678
..           ...           ...             ...        ...    ...                     ...             ...                      ...
564       21.560         22.39          142.00     1479.0    ...                  0.2216          0.2060                  0.07115
565       20.130         28.25          131.20     1261.0    ...                  0.1628          0.2572                  0.06637
566       16.600         28.08          108.30      858.1    ...                  0.1418          0.2218                  0.07820
567       20.600         29.33          140.10     1265.0    ...                  0.2650          0.4087                  0.12400
568        7.760         24.54           47.92      181.0    ...                  0.0000          0.2871                  0.07039

569 rows × 30 columns
In [28]:
df.corr()
Out[28]:
                         mean radius  mean texture  mean perimeter  mean area    ...    worst concave points  worst symmetry  worst fractal dimension
mean radius                 1.000000      0.323782        0.997855   0.987357    ...                0.744214        0.163953                 0.007066
mean texture                0.323782      1.000000        0.329533   0.321086    ...                0.295316        0.105008                 0.119205
mean perimeter              0.997855      0.329533        1.000000   0.986507    ...                0.771241        0.189115                 0.051019
mean area                   0.987357      0.321086        0.986507   1.000000    ...                0.722017        0.143570                 0.003738
...                              ...           ...             ...        ...    ...                     ...             ...                      ...
worst concave points        0.744214      0.295316        0.771241   0.722017    ...                1.000000        0.502528                 0.511114
worst symmetry              0.163953      0.105008        0.189115   0.143570    ...                0.502528        1.000000                 0.537848
worst fractal dimension     0.007066      0.119205        0.051019   0.003738    ...                0.511114        0.537848                 1.000000

30 rows × 30 columns
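The scatter_matrix imported at the top of the notebook (and unused so far) gives a quick visual companion to this correlation table. A minimal sketch on three of the columns (the column choice here is just for illustration):

scatter_matrix(df[['mean radius', 'mean perimeter', 'mean texture']], figsize=(8, 8))
plt.show()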
Since I am not an expert in breast cancer, I do not feel able to select the most relevant features for a meaningful analysis. I have therefore chosen two features that share a high correlation: mean radius and mean perimeter (0.997855 in the table above). Note, however, that the slice breast_cancer.data[:, :2] below actually takes the first two columns, which are mean radius and mean texture; selecting radius and perimeter would require columns 0 and 2.
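A minimal sketch of selecting the intended columns by name rather than by position, reusing the df built above:

x = df[['mean radius', 'mean perimeter']].values  # columns 0 and 2 of breast_cancer.data
y = breast_cancer.target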
In [30]:
x = breast_cancer.data[:, :2]  # first two columns: mean radius and mean texture
y = breast_cancer.target
In [31]:
dt = tree.DecisionTreeClassifier()
In [32]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.5,train_size=0.5)
In [33]:
dt = dt.fit(x_train,y_train)
In [34]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [37]:
measure_performance(x_train,y_train,dt)
Accuracy:1.000

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       113
          1       1.00      1.00      1.00       171

avg / total       1.00      1.00      1.00       284

Confusion matrix
[[113   0]
 [  0 171]]
In [38]:
measure_performance(x_test,y_test,dt)
Accuracy:0.863

Classification report
             precision    recall  f1-score   support

          0       0.83      0.77      0.80        99
          1       0.88      0.91      0.90       186

avg / total       0.86      0.86      0.86       285

Confusion matrix
[[ 76  23]
 [ 16 170]]
In [39]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)
In [40]:
dt = dt.fit(x_train,y_train)
In [41]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")
In [42]:
measure_performance(x_train,y_train,dt)
Accuracy:1.000

Classification report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       151
          1       1.00      1.00      1.00       275

avg / total       1.00      1.00      1.00       426

Confusion matrix
[[151   0]
 [  0 275]]
In [43]:
measure_performance(x_test,y_test,dt)
Accuracy:0.825

Classification report
             precision    recall  f1-score   support

          0       0.80      0.79      0.79        61
          1       0.84      0.85      0.85        82

avg / total       0.82      0.83      0.82       143

Confusion matrix
[[48 13]
 [12 70]]
In both cases the model fits the training dataset perfectly (accuracy 1.000) while scoring noticeably lower on the test dataset: classic overfitting. I first suspected my choice of two correlated features was to blame, but the more direct cause is the default DecisionTreeClassifier itself: with no depth limit, it keeps splitting until it effectively memorizes the training data, so perfect training accuracy is expected regardless of which features are used. The gap between training and test accuracy is the real overfitting signal.
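One standard remedy is to constrain the tree's capacity. A minimal sketch, reusing the split from In [39] (max_depth=3 and the seed are arbitrary illustrative choices, not tuned values):

dt_pruned = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
dt_pruned = dt_pruned.fit(x_train, y_train)
measure_performance(x_train, y_train, dt_pruned)  # training accuracy should drop below 1.000
measure_performance(x_test, y_test, dt_pruned)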
Content source: ledeprogram/algorithms