In [1]:
# Render figures inline. Note that %pylab also star-imports numpy and
# matplotlib.pyplot names (e.g. `arange`, `numpy`) into the notebook namespace.
%pylab inline
In [41]:
# Import libraries
from __future__ import absolute_import, division, print_function
# Ignore warnings
# (this silences every warning category for the session, deprecation notices included)
import warnings
warnings.filterwarnings('ignore')
import sys
# Put the local tools/ directory on the module search path
sys.path.append('tools/')
import numpy as np
import pandas as pd
# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
# Use seaborn's "white" plot style
sns.set_style("white")
# Configure for presentation
# Arrays with more than 50 elements print summarized; printed lines wrap at 50 characters
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
# Default font size for every figure
mpl.rc('font', size=16)
from IPython.display import display
To get our feet wet, let's take a look at the city of San Francisco's bike sharing data.
In [3]:
# Load the raw trip records.
trips = pd.read_csv('data/trip.csv')
# Keep only trips shorter than 1800 seconds (30 minutes).
# Boolean indexing drops the non-matching rows; `DataFrame.where` would keep every
# row and overwrite the non-matching ones with NaN.
commute = trips[trips.Duration < 1800]
In [4]:
# Preview the first rows of the full table, then of the sub-30-minute subset.
display(trips.head(), commute.head())
For this analysis, we are limiting ourselves to bike sharing trips that are less than 30 minutes long, because that is what comes free as part of the bike sharing program for the city of San Francisco. We want to draw a histogram of the duration to understand exactly how long trips typically take.
To do that, I am going to start off by making bins for the data. For example, we can group the durations (recorded in seconds) into bins that are each one minute wide.
In [5]:
half_hour = 30 * 60  # 30 minutes times 60 seconds
one_minute = 60  # bin width in seconds
# One-minute bin edges starting at 1 second. The stop is pushed out by one bin
# width so the last edge (1801) lies past the 30-minute cutoff; stopping at
# half_hour + 1 ended the edges at 1741 and left the final minute without a bin.
bins = np.arange(1, half_hour + one_minute, one_minute)
In [6]:
# Default figure size (width, height) in inches for the plots that follow.
pyplt.rc('figure', figsize=(4, 3))
In [7]:
# Histogram of commute durations on the density scale.
# `density=True` replaces the `normed` keyword, which matplotlib removed in 3.1.
commute.hist('Duration', bins=bins, density=True)
pyplt.ylabel('percent per unit');
In [8]:
def bin_frequency(k):
    """Plot a density-scaled histogram of commute durations with bins k seconds wide

    Attributes
    ----------
    k: int, width of each bin in seconds
    """
    # Push the stop out by one bin width so the last edge reaches the 30-minute
    # cutoff; stopping at half_hour + 1 left the longest commutes without a bin.
    bins = np.arange(1, half_hour + k, k)
    # `density` replaces the `normed` keyword that matplotlib removed in 3.1
    commute.hist('Duration', bins=bins, density=True)
In [9]:
# Same histogram, this time with ten-second-wide bins.
bin_frequency(k=10)
pyplt.gca().set_ylabel('percent per unit');
In [10]:
# Load the weather records.
weather = pd.read_csv('data/weather.csv')
# Keep only the rows for zip code 94107. Boolean indexing drops the other rows;
# `DataFrame.where` would keep them all and fill the non-matching ones with NaN.
sf = weather[weather.Zip == 94107]
In [11]:
# Keep the columns at positions 0, 1 and 3. `.iloc` makes the positional lookup
# explicit (integer lists inside plain `[]` are treated as column labels), and
# `.copy()` gives an independent frame that later cells can add columns to.
sf = sf.iloc[:, [0, 1, 3]].copy()
# Give the three retained columns short names.
sf.columns = [u'PDT', 'High', 'Low']
In [12]:
# Preview the first five rows of the trimmed weather table.
sf.head(5)
Out[12]:
In [13]:
def axis_tick_frequency(ax, axis, freq):
    """Space the tick marks of one axis at a fixed interval

    Ticks are placed from the axis' current lower limit up to (but not
    including) its upper limit, `freq` units apart.

    Attributes
    ----------
    ax: matplotlib axis object
    axis: char, either 'y' or 'x'
    freq: int, the distance between consecutive tick marks

    Raises ValueError when axis is neither 'y' nor 'x'
    """
    if axis == 'y':
        limits, target = ax.get_ylim(), ax.yaxis
    elif axis == 'x':
        limits, target = ax.get_xlim(), ax.xaxis
    else:
        message = '{argument} is not a valid axis object'
        raise ValueError(message.format(argument=repr(axis)))
    lower, upper = limits
    target.set_ticks(np.arange(lower, upper, freq))
In [14]:
# Overlaid density histograms of the High and Low columns in 5-unit bins from 30 to 100.
# The columns are selected by name rather than by integer position inside `[]`,
# and `density` replaces the `normed` keyword that matplotlib removed in 3.1.
ax = sf[['High', 'Low']].plot.hist(color=['coral', 'cadetblue'], bins=np.arange(30, 101, 5), density=True, alpha=0.5)
pyplt.ylabel('percent per unit')
# Anchor the legend just outside the right edge of the axes
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pyplt.grid(True);
In [15]:
# Count of rows whose Low is at least 45, as a percentage of 365
# (assumes the table covers 365 days -- TODO confirm)
sf.Low[sf.Low >= 45].count() / 365 * 100
Out[15]:
In [16]:
# Row-wise gap between the High and Low columns
sf['diff'] = sf['High'] - sf['Low']
In [17]:
# Histogram of the High/Low gap in unit-wide bins covering 0 through 39
ax = sf['diff'].plot.hist(bins=np.arange(40))
# Put a y tick every 10 counts
axis_tick_frequency(ax, axis='y', freq=10)
pyplt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.);
In [18]:
# Load the top movies table
top = pd.read_csv(filepath_or_buffer='data/top_movies.csv')
In [19]:
# Preview the first five rows of the movies table
top.head(5)
Out[19]:
Let's make a deterministic sample. We will sample every 20th top-grossing movie, starting from the first row.
In [20]:
interval = 20
# Row positions 0, 20, 40, ... up to the end of the table. `np.arange` is spelled
# out so the cell does not depend on a bare `arange` having been star-imported.
sample = np.arange(0, len(top), interval)
In [21]:
# Pull every sampled row in one positional lookup. This replaces growing an empty
# frame with `DataFrame.append` / `.ix` one row at a time: both were removed from
# pandas, and the loop copied the whole frame on every iteration.
df = top.iloc[sample]
In [22]:
# Display the sampled rows
df
Out[22]:
A population is the set of all elements from whom a sample will be drawn. A probability sample is one for which it is possible to calculate, before the sample is drawn, the chance with which any subset of elements will enter the sample. In a probability sample, all elements need not have the same chance of being chosen.
Now let's do probabilistic sampling of the top movies. When we do a probabilistic sample like this, we call it a "systematic sample."
In [23]:
# Pick a random starting row in [0, interval), then take every `interval`-th row after it
start = np.random.choice(np.arange(interval))
sample = np.arange(start, len(top), interval)
# One positional lookup replaces the row-by-row `DataFrame.append` / `.ix` loop
# (both were removed from pandas, and the loop re-copied the frame on every pass)
df = top.iloc[sample]
In [24]:
# Display the sampled rows
df
Out[24]:
In [25]:
def sample(num_sample, top):
    """
    Draw row positions from a table uniformly at random, with replacement

    Attributes
    ---------
    num_sample: int, how many positions to draw
    top: dataframe (only its length is used)

    Returns a list of num_sample random positions in [0, len(top))
    """
    # one independent draw over the whole table per requested sample
    return [np.random.randint(0, len(top)) for _ in np.arange(0, num_sample, 1)]
def sample_no_replacement(num_sample, top):
    """
    Draw distinct row positions from a table uniformly at random (no replacement)

    Attributes
    ---------
    num_sample: int, how many positions to draw (at most len(top))
    top: dataframe (only its length is used)

    Returns a list of num_sample distinct random positions in [0, len(top))
    Raises ValueError when num_sample is larger than len(top)
    """
    # A single call draws all positions without repeats; the previous version
    # rebuilt the pool of remaining positions with setdiff1d after every draw.
    chosen = np.random.choice(len(top), size=num_sample, replace=False)
    return list(chosen)
In [28]:
# Draw 35 row positions with replacement and show the matching movies by title.
index_ = sample(35, top)
# The drawn values are positions, so look them up with `.iloc`
# (`.ix` was deprecated and then removed from pandas).
df = top.iloc[index_]
df.sort_values(by='Title')
Out[28]:
In [29]:
# Draw 25 distinct row positions and show the matching movies by title.
index_ = sample_no_replacement(25, top)
# The drawn values are positions, so look them up with `.iloc`
# (`.ix` was deprecated and then removed from pandas).
df = top.iloc[index_]
df.sort_values(by='Title')
Out[29]:
In [30]:
# A six-sided die as a one-column table: one row per face
die = pd.DataFrame({"Face": list(range(1, 7))})
In [31]:
# Display the die table
die
Out[31]:
In [32]:
# A coin as a one-column table with its two faces coded 1 and 2
coin = pd.DataFrame({"Face": [1, 2]})
coin
Out[32]:
We can simulate the act of rolling dice by pulling rows out of the table at random.
In [33]:
# Three rolls: draw three row positions at random and show the faces at those rows.
index_ = sample(3, die)
# Positional lookup with `.iloc` (`.ix` was deprecated and then removed from pandas)
df = die.iloc[index_]
df
Out[33]:
In [34]:
# One flip: draw a single row position at random and show the face at that row.
index_ = sample(1, coin)
# Positional lookup with `.iloc` (`.ix` was deprecated and then removed from pandas)
df = coin.iloc[index_]
df
Out[34]:
In [35]:
def dice_hist(n):
    """Construct histogram of n simulated dice rolls

    Attributes
    -----------
    n: int, number of rolls to simulate (must be greater than 0)

    Raises ValueError when n is not greater than 0
    """
    if not n > 0:
        raise ValueError('n has to be greater than 0')
    # Unit-wide bins with edges 0.5, 1.5, ..., 6.5
    dice_bins = np.arange(0.5, 7, 1)
    index_ = sample(n, die)
    # The drawn values are row positions, so use `.iloc` (`.ix` was removed from pandas)
    df = die.iloc[index_]
    # `density` replaces the `normed` keyword that matplotlib removed in 3.1
    df.plot.hist(bins=dice_bins, density=True)
    pyplt.ylabel('percent per unit')
    pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
def dice_sum_hist(n):
    """
    Construct histogram of rolling a pair of dice and plot the sum of the faces

    Attributes
    -----------
    n: int, number of times the pair is rolled (must be greater than 0)

    Raises ValueError when n is not greater than 0
    """
    if not n > 0:
        raise ValueError('n has to be greater than 0')
    # randint's upper bound is exclusive, hence 6 + 1 to include the six
    d1 = np.random.randint(1, 6 + 1, n)
    d2 = np.random.randint(1, 6 + 1, n)
    data = d1 + d2
    # Unit-wide bins centred on each observed integer value
    bins = np.arange(data.min() - 0.5, data.max() + 1, 1)
    # `density` replaces the `normed` keyword that matplotlib removed in 3.1
    pyplt.hist(data, bins=bins, density=True)
    pyplt.ylabel('percent per unit')
    pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
def dice_prod_hist(n):
    """
    Construct histogram of rolling a pair of dice and plotting the product of the faces.

    Attributes
    -----------
    n: int, number of times the pair is rolled (must be greater than 0)

    Raises ValueError when n is not greater than 0
    """
    if not n > 0:
        raise ValueError('n has to be greater than 0')
    # randint's upper bound is exclusive, hence 6 + 1 to include the six
    d1 = np.random.randint(1, 6 + 1, n)
    d2 = np.random.randint(1, 6 + 1, n)
    data = d1 * d2
    # Unit-wide bins centred on each integer between the smallest and largest product
    bins = np.arange(data.min() - 0.5, data.max() + 1, 1)
    # `density` replaces the `normed` keyword that matplotlib removed in 3.1
    pyplt.hist(data, bins=bins, density=True)
    pyplt.ylabel('percent per unit')
    pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [59]:
# Unit-wide bins centred on each coin face
low, high = coin.Face.min() - 0.5, coin.Face.max() + 1
bins = np.arange(low, high, 1)
# Put the histogram on the density scale; `density` replaces the `normed`
# keyword that matplotlib removed in 3.1
coin.plot.hist(bins=bins, density=True)
pyplt.ylabel('percent per unit')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [60]:
# Unit-wide bins centred on each die face
low, high = die.Face.min() - 0.5, die.Face.max() + 1
bins = np.arange(low, high, 1)
# Put the histogram on the density scale; `density` replaces the `normed`
# keyword that matplotlib removed in 3.1
die.plot.hist(bins=bins, density=True)
pyplt.ylabel('percent per unit')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [38]:
# Histogram of 20 simulated rolls
dice_hist(n=20)
In [39]:
# Histogram of 10000 simulated rolls
dice_hist(n=10000)
Plot the sum of a pair of dice, rolled 100 times:
In [44]:
# Sum of a pair of dice over 100 simulated rolls
n = 100
dice_sum_hist(n=n)
Roll two six sided dice 1000 times and sum the results:
In [47]:
# Sum of a pair of dice over 1000 simulated rolls
n = 1000
dice_sum_hist(n=n)
The graph below helps us see that probability histograms can have gaps. It plots the histogram of the product of a pair of dice. The smallest product is 1, while the largest is 36. Because each face is a whole number from 1 to 6, some values in between can never occur: 7 and 11, for example, are primes larger than 6, so no pair of faces multiplies to them.
In [48]:
# Product of a pair of dice over 100 simulated rolls
n = 100
dice_prod_hist(n=n)
In [56]:
# Box model holding three tickets numbered 1, 2 and 3
box = pd.DataFrame({"Content": [1, 2, 3]})
In [57]:
# Unit-wide bins centred on each ticket value in the box
low, high = box.Content.min() - 0.5, box.Content.max() + 1
bins = np.arange(low, high, 1)
# `density` replaces the `normed` keyword that matplotlib removed in 3.1
box.plot.hist(bins=bins, density=True)
pyplt.ylabel('percent per unit')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [51]:
def sum_draws(n, box, repetitions=None):
    """
    Construct histogram for the sum of n draws from a box with replacement

    Each repetition draws n tickets at random with replacement and records
    their sum; the histogram shows the sums from all repetitions.

    Attributes
    -----------
    n: int (number of draws, must be greater than 0)
    box: dataframe (the box model, ticket values in its `Content` column)
    repetitions: int, optional (how many sums to simulate; defaults to n,
        which is what the function did before this parameter existed)

    Raises ValueError when n or repetitions is not greater than 0
    """
    # Validate before allocating so an invalid n always gets this message
    if not n > 0:
        raise ValueError('n has to be greater than 0')
    if repetitions is None:
        repetitions = n
    if not repetitions > 0:
        raise ValueError('repetitions has to be greater than 0')
    # Read the tickets once as an array; the draws below are row positions
    tickets = box.Content.values
    data = np.zeros(shape=(repetitions, 1))
    for i in range(repetitions):
        index_ = np.random.randint(0, len(box), n)
        data[i] = tickets[index_].sum()
    # Unit-wide bins centred on each integer between the smallest and largest sum
    bins = np.arange(data.min() - 0.5, data.max() + 1, 1)
    # `density` replaces the `normed` keyword that matplotlib removed in 3.1
    pyplt.hist(data, bins=bins, density=True)
    pyplt.ylabel('percent per unit')
    pyplt.xlabel('Number on ticket')
    pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [61]:
# Sum of 100 draws from the box
sum_draws(n=100, box=box)
In [71]:
# Box model holding two tickets numbered 0 and 1
box = pd.DataFrame({"Content": [0, 1]})
In [72]:
pyplt.rcParams['figure.figsize'] = (4, 3)
# Unit-wide bins centred on each ticket value in the box
low, high = box.Content.min() - 0.5, box.Content.max() + 1
bins = np.arange(low, high, 1)
# `density` replaces the `normed` keyword that matplotlib removed in 3.1
box.plot.hist(bins=bins, density=True)
pyplt.ylabel('percent per unit')
pyplt.xlabel('Number on ticket')
pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [76]:
# Sum of 10000 draws from the box
sum_draws(n=10000, box=box)