In [1]:
%matplotlib inline
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [2]:
# Location of the births table; it is parsed as tab-separated text.
fname = "https://raw.githubusercontent.com/Bio204-class/bio204-datasets/master/births.txt"
births = pd.read_csv(fname, sep="\t")
In [3]:
# preview the first five rows of the table
births.head(n=5)
Out[3]:
In [4]:
# distinct values that occur in the "premature" column
births["premature"].unique()
Out[4]:
In [5]:
# distinct values that occur in the "smoke" column
births["smoke"].unique()
Out[5]:
In [6]:
# count the missing (NA / NaN) entries in the "visits" column
births["visits"].isna().sum()
Out[6]:
In [7]:
# count the missing entries column by column
births.isna().sum()
Out[7]:
In [8]:
# Boolean masks with one entry per row of births, flagging each term category
ispremie = births["premature"] == "premie"
isfullterm = births["premature"] == "full term"

# True counts as 1 when summed, so summing a mask gives the number of matching rows
npremie = ispremie.sum()
nfullterm = isfullterm.sum()

print("Full term births:", nfullterm)
print("Premature births:", npremie)
In [9]:
# one Boolean mask per sex
isgirl = births["sexBaby"] == "female"
isboy = births["sexBaby"] == "male"

# selecting rows with a Boolean mask returns a new DataFrame
babyGirls = births.loc[isgirl]
babyBoys = births.loc[isboy]

print("Baby girls:", babyGirls.shape[0])
print("Baby boys:", babyBoys.shape[0])
In [10]:
# Boolean masks combine element-wise with & (and) and | (or);
# this works for pandas.Series as well as numpy arrays
premieGirls = births.loc[isgirl & ispremie]
premieBoys = births.loc[isboy & ispremie]

print("Premie girls:", premieGirls.shape[0])
print("Premie boys:", premieBoys.shape[0])
In [11]:
# | keeps the rows where either mask is True: premature OR mother smoked
momsmoked = births["smoke"] == "smoker"
premieOrMomSmoked = births.loc[ispremie | momsmoked]
print("Premie or Mom Smoked:", premieOrMomSmoked.shape[0])
In [12]:
# element-wise & and | on Boolean numpy arrays
a = np.array([True, False, True])
b = np.array([False, False, True])

for label, value in (
    ("a =", a),
    ("b =", b),
    ("bitwise and, a & b = ", a & b),
    ("bitwise or, a | b = ", a | b),
):
    print(label, value)
In [13]:
# the same kind of row selection, written as a query string instead of Boolean masks
premie_smoker_query = '(premature == "premie") and (smoke == "smoker")'
premieAndSmoke = births.query(premie_smoker_query)
premieAndSmoke
Out[13]:
In [14]:
# split the rows of births by the value in the "premature" column
termgroup = births.groupby(by="premature")
In [15]:
# apply a summary function with respect to the grouping
# (one set of descriptive statistics per group)
termgroup.describe()
Out[15]:
In [16]:
# check what kind of object groupby() returned
type(termgroup)
Out[16]:
In [17]:
# the groupby object has a groups dictionary associated with it;
# display the type of that attribute
type(termgroup.groups)
Out[17]:
In [18]:
# the keys of this dictionary are the actual grouping terms,
# i.e. the distinct values of the column we grouped by
termgroup.groups.keys()
Out[18]:
In [19]:
# pull out the rows belonging to a single group, then show its type and dimensions
premies = termgroup.get_group("premie")
(type(premies), premies.shape)
Out[19]:
In [20]:
# group on two columns at once: term category and smoking status
term_and_smoke_group = births.groupby(by=["premature", "smoke"])
In [21]:
# with two grouping columns, each key is a (premature, smoke) pair
term_and_smoke_group.groups.keys()
Out[21]:
In [22]:
# descriptive statistics of birth weight within each (premature, smoke) group
term_and_smoke_group["weight"].describe()
Out[22]:
Up to now we've mostly been using a "function-based" approach to Matplotlib, where we build up a plot through a series of function calls. This works pretty well, but starts to break down when we want to create more complicated plots. There's another way to approach plot-building in Matplotlib, built around the manipulation of `figure` and `axis` objects. I demonstrate these two styles of plot-building below.
In [23]:
# function-based (pyplot) style: each call acts on the implicit current axes
plt.scatter(births["mAge"], births["fAge"])
plt.title("Relationship between Age of Parents\nBased on 150 births from NC")
plt.xlabel("Age of Mother")
plt.ylabel("Age of Father")
pass
In [24]:
# an empty figure holding a single, still blank, axes
fig = plt.figure()
# the rectangle is (left, bottom, width, height) in figure fractions
axes = fig.add_axes((0.1, 0.1, 0.8, 0.8))
In [25]:
# object-oriented style: build the figure and axes first ...
fig = plt.figure()
axes = fig.add_axes((0.1, 0.1, 0.8, 0.8))  # (left, bottom, width, height)

# ... then draw on, and label, that specific axes object
axes.scatter(births["mAge"], births["fAge"])
axes.set(
    xlabel="Age of Mother",
    ylabel="Age of Father",
    title="Relationship between Age of Parents\nBased on 150 births from NC",
)
pass
Above we recreated the scatter plot using the object oriented approach, but the code was somewhat more verbose. However, the object oriented approach really starts to shine when we want to start tweaking the figure and adding new elements. Here I'll recreate the scatter plot and then add two new axes, depicting the marginal histograms on the top and right side.
In [26]:
# create our figure and axis objects
fig = plt.figure(figsize=(6, 6))

# For the figure to be accurate, both marginal histograms must use the same
# limits as the scatter plot, so the limits are defined once and reused.
age_limits = (10, 50)

# note that we've made the main axis take up less of the total figure
axes = fig.add_axes([0.1, 0.1, 0.5, 0.5])  # left, bottom, width, height
# change/add features of axis
axes.scatter(births.mAge, births.fAge)
axes.set_xlabel("Age of Mother")
axes.set_ylabel("Age of Father")
axes.set_xlim(*age_limits)
axes.set_ylim(*age_limits)

# add new axis on right of figure to draw histogram of father's age
# (I figured out the coordinates and width by sketching and trial and error)
right = fig.add_axes([0.7, 0.1, 0.15, 0.5])
# density=True replaces the `normed` argument, which newer matplotlib no longer accepts
right.hist(births.fAge.dropna(), orientation="horizontal", density=True)
right.set_xticks([])
right.set_ylim(*age_limits)

# add new axis on top of figure to draw histogram of mother's age
above = fig.add_axes([0.1, 0.7, 0.5, 0.15])
# this panel sits above the x axis, so it must show mAge (the x variable), not fAge
above.hist(births.mAge.dropna(), orientation="vertical", density=True)
above.set_yticks([])
above.set_xlim(*age_limits)
pass
For more details, and a longer exposition of this object-oriented approach to Matplotlib, see this tutorial by J. R. Johansson.
In [27]:
import seaborn as sbn
In [28]:
# Histogram of birth weight with a kernel density estimate overlaid.
# Uses the `sns` alias from the notebook's first cell, like every other seaborn call,
# and histplot in place of the deprecated distplot; stat="density" and cut=3
# reproduce distplot's normalisation and KDE extent.
sns.histplot(x=births.weight, kde=True, stat="density", kde_kws={"cut": 3}, color="r")
pass
In [29]:
# Plot a histogram with a kernel density estimate (kde).
# histplot replaces the deprecated distplot; stat="density" and cut=3
# reproduce distplot's normalisation and KDE extent.
ax = sns.histplot(x=births.weight, kde=True, stat="density", kde_kws={"cut": 3}, color="r")
ax.set_ylabel("Density")
pass
In [30]:
# Plot a kde with a rug plot.
# distplot(hist=False, rug=True) is deprecated; draw the two layers explicitly
# on the same axes instead.
ax = sns.kdeplot(x=births.weight, color="r")
sns.rugplot(x=births.weight, color="r", ax=ax)
ax.set_ylabel("Density")
pass
Kernel density estimates have a parameter called the "bandwidth", which controls the degree of smoothing: a smaller bandwidth follows the data more closely, while a larger one gives a smoother curve.
In [31]:
# Overlay kernel density estimates of birth weight for several bandwidths.
# `bw_method` is the current name of the deprecated `bw` argument.
for bandwidth in (0.1, 0.25, 0.5):
    sns.kdeplot(x=births.weight, bw_method=bandwidth, label=f"bw = {bandwidth}")

# mark the individual observations along the x axis
sns.rugplot(x=births.weight, color="black")

plt.xlabel("Weight")
plt.ylabel("Density")
# current seaborn does not draw a legend for labelled curves on its own
plt.legend()
pass
In [34]:
# switch the global seaborn theme (this affects every plot drawn afterwards)
sns.set(style="whitegrid", palette="pastel", color_codes=True)

# birth weight by smoking status; each violin is split in two halves,
# one per term category, with quartile lines drawn inside
term_colors = {"full term": "b", "premie": "y"}
sns.violinplot(
    data=births,
    x="smoke",
    y="weight",
    hue="premature",
    split=True,
    inner="quartile",
    palette=term_colors,
)
plt.legend(loc="upper center")
pass
In [ ]: