In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import assignment2_helper as helper
# Feature-scaling switch read by later cells.
# Do * NOT * alter this line, until instructed!
scaleFeatures = False
# Use the 'ggplot' style sheet for every figure drawn in this notebook.
plt.style.use('ggplot')
# Create some color coded labels; the actual label feature
# will be removed prior to executing PCA, since it's unsupervised.
# You're only labeling by color so you can see the effects of PCA
labels = ['red' if i=='ckd' else 'green' for i in df.classification]
In [7]:
# TODO: Load up the dataset and remove any and all
# Rows that have a nan. You should be a pro at this
# by now ;-)
#
# QUESTION: Should the id column be included as a
# feature?
#
file = pd.read_csv('Datasets/kidney_disease.csv')
file = file.dropna()
file = file.drop(['id', 'classification', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], axis = 1)
file.head(5)
Out[7]:
In [11]:
# Convert pcv, wc and rc with pd.to_numeric, one column at a time.
# With its default settings pd.to_numeric raises on any value it cannot
# parse, so a failure here points at dirty data in one of these columns.
for column in ('pcv', 'wc', 'rc'):
    file[column] = pd.to_numeric(file[column])
# The following cells work with `df`; it is the same frame object as `file`.
df = file
Out[11]:
In [12]:
# TODO: PCA operates based on variance. The variable with the greatest
# variance will dominate. Go ahead and peek into your data using a
# command that will check the variance of every feature in your dataset.
# Print out the results. Also print out the results of running .describe
# on your dataset.
#
# Hint: If you don't see all three variables: 'bgr','wc' and 'rc', then
# you probably didn't complete the previous step properly.
#
# NOTE(review): only the per-column variance is computed in the visible part
# of this cell; the .describe() output requested above is not shown here --
# confirm it is produced further down.
df.var()