In [3]:
%pylab inline
In [46]:
# Import libraries
from __future__ import absolute_import, division, print_function
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('tools/')
import numpy as np
import pandas as pd
import math
# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style("white")
# Configure for presentation
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
mpl.rc('font', size=16)
from IPython.display import display
In [47]:
def sample(num_sample, top):
    """
    Draw random row positions from a table, with replacement

    Attributes
    ---------
    num_sample: int (how many positions to draw)
    top: dataframe (the table to draw from; only its length is used)

    Returns a list of num_sample positions, each picked independently from
    0 .. len(top) - 1, so the same position can appear more than once.
    """
    # one independent draw per requested sample, kept in draw order
    return [
        np.random.randint(0, len(top))
        for _ in np.arange(0, num_sample, 1)
    ]
def sample_no_replacement(num_sample, top):
    """
    Create a random sample from a table, without replacement

    Attributes
    ---------
    num_sample: int (number of distinct rows to pick)
    top: dataframe (the table to draw from; only its length is used)

    Returns a list of num_sample distinct positions in 0 .. len(top) - 1,
    in random order. A num_sample of zero or less returns an empty list.

    Raises ValueError if num_sample is larger than len(top), because that
    many distinct rows do not exist.
    """
    if num_sample <= 0:
        return []
    population = len(top)
    if num_sample > population:
        raise ValueError(
            'cannot draw {} rows without replacement from a table of {} rows'
            .format(num_sample, population))
    # A single call draws every position at once; replace=False guarantees
    # that no position is picked twice, so there is no need to shrink a
    # candidate array by hand after each draw.
    picked = np.random.choice(population, size=num_sample, replace=False)
    return picked.tolist()
In [48]:
# Box model for a six-sided die: one row (ticket) per face.
die = pd.DataFrame({"Face": [1, 2, 3, 4, 5, 6]})
In [49]:
# Show the die table as this cell's output.
die
Out[49]:
In [50]:
# Box model for a coin: one row (ticket) per side, labelled 1 and 2.
coin = pd.DataFrame({"Face": [1, 2]})
coin
Out[50]:
We can simulate rolling a die (or tossing a coin) by drawing rows at random, with replacement, from the corresponding table.
In [51]:
# Roll the die three times: draw three row positions at random (repeats
# allowed) and pull those rows out of the table.
index_ = sample(3, die)
# sample() returns integer positions, so select by position with .iloc;
# the old .ix indexer was removed in pandas 1.0.
df = die.iloc[index_, :]
df
Out[51]:
In [52]:
# Toss the coin once: draw a single row position at random.
index_ = sample(1, coin)
# sample() returns integer positions, so select by position with .iloc;
# the old .ix indexer was removed in pandas 1.0.
df = coin.iloc[index_, :]
df
Out[52]:
In [53]:
def sum_draws(n, box, repetitions=None):
    """
    Construct histogram for the sum of n draws from a box with replacement

    The experiment "draw n tickets with replacement and add them up" is
    repeated several times, and the resulting sums are drawn as a density
    histogram with one unit-wide bin per possible sum.

    Attributes
    -----------
    n: int (number of draws)
    box: dataframe (the box model; tickets are read from its Content column)
    repetitions: int, optional (how many sums to simulate; defaults to n,
        which matches the behaviour before this parameter existed)

    Raises ValueError if n or repetitions is not greater than 0.
    """
    # Validate before doing any work so bad input fails with a clear message.
    if n <= 0:
        raise ValueError('n has to be greater than 0')
    if repetitions is None:
        repetitions = n
    elif repetitions <= 0:
        raise ValueError('repetitions has to be greater than 0')

    tickets = box['Content'].values
    # One row per repetition, one column per draw: every entry is a ticket
    # position chosen uniformly at random, i.e. a draw with replacement.
    draws = np.random.randint(0, len(tickets), size=(repetitions, n))
    data = tickets[draws].sum(axis=1)

    # Bin edges sit on the half-integers so each possible sum gets its own bar.
    bins = np.arange(data.min() - 0.5, data.max() + 1, 1)
    # `density` is the replacement for the `normed` keyword, which Matplotlib
    # removed; the label gives the legend call below something to show.
    pyplt.hist(data, bins=bins, density=True,
               label='sum of {} draws'.format(n))
    pyplt.ylabel('percent per unit')
    pyplt.xlabel('Number on ticket')
    pyplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [54]:
# Box model holding five tickets numbered 0 through 4.
box = pd.DataFrame({"Content": [0, 1, 2, 3, 4]})
In [55]:
# Default size (width, height in inches) for the figures that follow.
pyplt.rcParams.update({'figure.figsize': (4, 3)})
In [56]:
# Histogram of the sum of 100 draws from the box.
sum_draws(n=100, box=box)
In [57]:
pyplt.rcParams['figure.figsize'] = (4, 3)
# Histogram of the box contents themselves. Bin edges sit on the
# half-integers so every ticket value gets its own unit-wide bar.
low, high = box.Content.min() - 0.5, box.Content.max() + 1
bins = np.arange(low, high, 1)
# `density` is the replacement for the `normed` keyword, which Matplotlib
# removed; pandas forwards it to the underlying hist call.
ax = box.plot.hist(bins=bins, density=True)
ax.set_ylabel('percent per unit')
ax.set_xlabel('Number on ticket')
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
In [58]:
# Histogram of the sum of 1000 draws from the box.
sum_draws(n=1000, box=box)
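The law of averages states that as the number of draws increases, the observed average gets closer to the expected average, even though the absolute difference between the observed and expected totals tends to grow. $$ \text{Chance Error} = \text{Observed} - \text{Expected} $$ In the case of coin tosses, as the number of tosses goes up, the absolute chance error (measured in number of heads) tends to go up, while the chance error as a percentage of the number of tosses goes down.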
In [59]:
def number_of_heads(n, box):
    """
    The number of heads in n tosses

    Each toss picks a ticket position uniformly at random from
    0 .. len(box) - 1, and the positions are added up. The total is a count
    of heads only when the box holds exactly two tickets, with tails at
    position 0 and heads at position 1; the ticket values themselves are
    never read.

    Attributes
    -----------
    n: int (number of draws)
    box: dataframe (the coin box model; only its length is used)

    Returns the sum of the n drawn positions.

    Raises ValueError if n is not greater than 0.
    """
    # Validate first so bad input fails before any random numbers are drawn.
    if n <= 0:
        raise ValueError('n has to be greater than 0')
    tosses = np.random.randint(0, len(box), n)
    return tosses.sum()
In [60]:
# Coin box model: two tickets, 0 and 1.
box = pd.DataFrame({"Content": [0, 1]})
In [78]:
# Run one coin-tossing experiment per toss count: low, low + step, ...,
# stopping before high.
low, high, step = 100, 10000, 2
num_tosses = np.arange(low, high, step, dtype=float)
length = len(num_tosses)
num_heads = np.array(
    [number_of_heads(n, box) for n in range(low, high, step)],
    dtype=float)
# Half of the tosses are expected to be heads. True division keeps the
# expectation exact even if step is changed so that odd counts appear.
chance_error = np.abs(num_tosses / 2 - num_heads)
# Distance of the observed percentage of heads from the expected 50%.
percentage_difference = np.abs(num_heads / num_tosses * 100 - 50)
In [79]:
# Collect the simulation results into one table, one row per experiment,
# initially indexed by the toss count.
avg_heads = pd.DataFrame(index=range(low, high, step))
column_data = [
    ('num_tosses', num_tosses),
    ('num_heads', num_heads),
    ('chance_error', chance_error),
    ('percentage_difference', percentage_difference),
]
for column_name, column_values in column_data:
    avg_heads[column_name] = column_values
# Move the toss-count index into an ordinary 'index' column, leaving the
# rows numbered 0, 1, 2, ...
avg_heads = avg_heads.reset_index()
In [107]:
pyplt.rcParams['figure.figsize'] = (8, 3)
fig, ax = pyplt.subplots()
# Pass num_tosses explicitly as x: a Series on its own is plotted against
# its row position, which would not match the 'Number of Tosses' label.
ax.plot(avg_heads.num_tosses, avg_heads.chance_error, 'ro', markersize=1)
ax.set_ylim(-50, 500)
ax.set_title('Modeling the Law of Averages')
ax.set_ylabel('Difference between \nObserved versus Expected')
ax.set_xlabel('Number of Tosses');
In [114]:
pyplt.rcParams['figure.figsize'] = (8, 4)
# `ax` is now a real Axes; pyplt.plot() returns a list of lines instead.
fig, ax = pyplt.subplots()
# Pass num_tosses explicitly as x: a Series on its own is plotted against
# its row position, which would not match the 'Number of Tosses' label.
ax.plot(avg_heads.num_tosses, avg_heads.percentage_difference, 'bo',
        markersize=1)
ax.set_ylim(-5, 20)
ax.set_title('Modeling the Law of Averages')
ax.set_ylabel('The Percentage Difference\n Between Observed and Expected')
ax.set_xlabel('Number of Tosses');
In [42]:
# Restore the small default figure size (width, height in inches).
pyplt.rcParams.update({'figure.figsize': (4, 3)})
In [ ]: