In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import assignment2_helper as helper

# Look pretty...
# matplotlib.style.use('ggplot')
plt.style.use('ggplot')

# Do * NOT * alter this line, until instructed!
scaleFeatures = False

# Create some color coded labels; the actual label feature
# will be removed prior to executing PCA, since it's unsupervised.
# You're only labeling by color so you can see the effects of PCA
labels = ['red' if i=='ckd' else 'green' for i in df.classification]

In [7]:
# TODO: Load up the dataset and remove any and all
# Rows that have a nan. You should be a pro at this
# by now ;-)
#
# QUESTION: Should the id column be included as a
# feature?
#
file = pd.read_csv('Datasets/kidney_disease.csv')
file = file.dropna()
file = file.drop(['id', 'classification', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], axis = 1)
file.head(5)


Out[7]:
age bp sg al su bgr bu sc sod pot hemo pcv wc rc
3 48.0 70.0 1.005 4.0 0.0 117.0 56.0 3.8 111.0 2.5 11.2 32 6700 3.9
9 53.0 90.0 1.020 2.0 0.0 70.0 107.0 7.2 114.0 3.7 9.5 29 12100 3.7
11 63.0 70.0 1.010 3.0 0.0 380.0 60.0 2.7 131.0 4.2 10.8 32 4500 3.8
14 68.0 80.0 1.010 3.0 2.0 157.0 90.0 4.1 130.0 6.4 5.6 16 11000 2.6
20 61.0 80.0 1.015 2.0 0.0 173.0 148.0 3.9 135.0 5.2 7.7 24 9200 3.2

In [11]:
file.pcv = pd.to_numeric(file.pcv)
file.wc = pd.to_numeric(file.wc)
file.rc = pd.to_numeric(file.rc)
df = file


Out[11]:
age     float64
bp      float64
sg      float64
al      float64
su      float64
bgr     float64
bu      float64
sc      float64
sod     float64
pot     float64
hemo    float64
pcv       int64
wc        int64
rc      float64
dtype: object

In [12]:
# TODO: PCA Operates based on variance. The variable with the greatest
# variance will dominate. Go ahead and peek into your data using a
# command that will check the variance of every feature in your dataset.
# Print out the results. Also print out the results of running .describe
# on your dataset.
#
# Hint: If you don't see all three variables: 'bgr','wc' and 'rc', then
# you probably didn't complete the previous step properly.
df.var()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-0b79260256fd> in <module>()
      7 # Hint: If you don't see all three variables: 'bgr','wc' and 'rc', then
      8 # you probably didn't complete the previous step properly.
----> 9 df.var()

NameError: name 'df' is not defined