In [3]:
import pandas as pd # pandas is a dataframe library
import matplotlib.pyplot as plt # matplotlib.pyplot plots data
import numpy as np # numpy provides N-dim object support
# do plotting inline instead of in a separate window
%matplotlib inline
In [5]:
df = pd.read_csv("./data/pima-data.csv") # load Pima data. Adjust path as necessary
In [6]:
df.shape
Out[6]:
In [7]:
df.head(5)
Out[7]:
In [8]:
df.tail(5)
Out[8]:
From the metadata on the data source we have the following definition of the features.
Feature | Description | Comments |
---|---|---|
num_preg | number of pregnancies | |
glucose_conc | Plasma glucose concentration at 2 hours in an oral glucose tolerance test | |
diastolic_bp | Diastolic blood pressure (mm Hg) | |
thickness | Triceps skin fold thickness (mm) | |
insulin | 2-Hour serum insulin (mu U/ml) | |
bmi | Body mass index (weight in kg/(height in m)^2) | |
diab_pred | Diabetes pedigree function | |
age | Age (years) | |
skin | ???? | What is this? |
diabetes | Class variable (1=True, 0=False) | Why is our data boolean (True/False)? |
Pandas makes it easy to see if there are any null values in the data frame. The isnull() method checks each value in the data frame for null values, and .values.any() then returns True if at least one null is found (False otherwise).
In [32]:
df.isnull().values.any()
Out[32]:
In [15]:
def plot_corr(df, size=10):
    """
    Draw the pairwise column correlations of a DataFrame as a colour-coded grid.

    Input:
        df: pandas DataFrame
        size: width and height of the (square) figure, passed to figsize

    Displays:
        One cell per pair of columns, coloured by the correlation value using
        matplotlib's default colormap. A column normally correlates perfectly
        with itself, so expect the diagonal running from top left to bottom
        right to show the colour of the maximum correlation (1).
    """
    correlations = df.corr()  # pairwise correlation of the columns
    labels = correlations.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(correlations)  # one coloured cell per column pair

    # label every column (x axis) and row (y axis) of the grid with its column name
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
In [14]:
plot_corr(df)
In [16]:
df.corr()
Out[16]:
In [17]:
df.head()
Out[17]:
In [18]:
# Drop the 'skin' column. errors='ignore' keeps this cell safe to re-run:
# a plain `del df['skin']` raises KeyError once the column is already gone.
df = df.drop(columns=['skin'], errors='ignore')
In [19]:
df.head()
Out[19]:
In [25]:
plot_corr(df)
In [21]:
df.head(5)
Out[21]:
Change True to 1, False to 0
In [26]:
diabetes_map = {True : 1, False : 0}
In [27]:
# Recode the label column through diabetes_map (True -> 1, False -> 0).
# Series.map yields NaN for any value that is not a key of the mapping.
df['diabetes'] = df['diabetes'].map(diabetes_map)
In [28]:
df.head(5)
Out[28]:
In [31]:
# Class balance check: how many rows are labelled diabetic vs. not diabetic.
# The label column holds 1/0 after the mapping step, so compare against the
# integer codes (1 == True in Python, so boolean labels are counted the same way).
num_true = int((df['diabetes'] == 1).sum())
num_false = int((df['diabetes'] == 0).sum())
num_total = num_true + num_false  # rows whose label is neither 1 nor 0 are excluded

if num_total == 0:
    # avoid ZeroDivisionError on an empty frame or a fully unlabelled column
    print("No rows with a True/False diabetes label found")
else:
    print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true / num_total) * 100))
    print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false / num_total) * 100))
In [ ]: