In [28]:
import pandas as pd
from pylab import savefig
import seaborn as sns
import plotly.plotly as py
from plotly.graph_objs import *
py.sign_in("sebecketthile", "a649z2ue29")
%matplotlib inline
In [29]:
# Import data into dataframe with read_csv:
df = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv')
group_df = df.groupby('Major_category').mean()
group_df
quartile_df = group_df[['Median','P25th','P75th']].transpose()
quartile_df
sns.set_context('poster')
sns.set_style('whitegrid')
sns.boxplot(quartile_df, vert=False, linewidth=3)
sns.despine()
savefig('majors.png', bbox_inches='tight')
In [14]:
sns.set(font="monospace")
df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)
used_networks = [1, 5, 6, 7, 8, 11, 12, 13, 16, 17]
used_columns = (df.columns.get_level_values("network")
.astype(int)
.isin(used_networks))
df = df.loc[:, used_columns]
network_pal = sns.cubehelix_palette(len(used_networks),
light=.9, dark=.1, reverse=True,
start=1, rot=-2)
network_lut = dict(zip(map(str, used_networks), network_pal))
networks = df.columns.get_level_values("network")
network_colors = pd.Series(networks).map(network_lut)
cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
sns.clustermap(df.corr(), row_colors=network_colors, method="average",
col_colors=network_colors, figsize=(13, 13), cmap=cmap)
Out[14]:
In [15]:
import numpy as np
from scipy.stats import kendalltau
import seaborn as sns
sns.set(style="ticks")
rs = np.random.RandomState(11)
x = rs.gamma(2, size=1000)
y = -.5 * x + rs.normal(size=1000)
sns.jointplot(x, y, kind="hex", stat_func=kendalltau, color="#4CB391")
Out[15]:
In [30]:
data = Data([
Surface(
z=[[27.80985, 49.61936, 83.08067, 116.6632, 130.414, 150.7206, 220.1871, 156.1536, 148.6416, 203.7845, 206.0386, 107.1618, 68.36975, 45.3359, 49.96142, 21.89279, 17.02552, 11.74317, 14.75226, 13.6671, 5.677561, 3.31234, 1.156517, -0.147662], [27.71966, 48.55022, 65.21374, 95.27666, 116.9964, 133.9056, 152.3412, 151.934, 160.1139, 179.5327, 147.6184, 170.3943, 121.8194, 52.58537, 33.08871, 38.40972, 44.24843, 69.5786, 4.019351, 3.050024, 3.039719, 2.996142, 2.967954, 1.999594], [30.4267, 33.47752, 44.80953, 62.47495, 77.43523, 104.2153, 102.7393, 137.0004, 186.0706, 219.3173, 181.7615, 120.9154, 143.1835, 82.40501, 48.47132, 74.71461, 60.0909, 7.073525, 6.089851, 6.53745, 6.666096, 7.306965, 5.73684, 3.625628], [16.66549, 30.1086, 39.96952, 44.12225, 59.57512, 77.56929, 106.8925, 166.5539, 175.2381, 185.2815, 154.5056, 83.0433, 62.61732, 62.33167, 60.55916, 55.92124, 15.17284, 8.248324, 36.68087, 61.93413, 20.26867, 68.58819, 46.49812, 0.2360095], [8.815617, 18.3516, 8.658275, 27.5859, 48.62691, 60.18013, 91.3286, 145.7109, 116.0653, 106.2662, 68.69447, 53.10596, 37.92797, 47.95942, 47.42691, 69.20731, 44.95468, 29.17197, 17.91674, 16.25515, 14.65559, 17.26048, 31.22245, 46.71704], [6.628881, 10.41339, 24.81939, 26.08952, 30.1605, 52.30802, 64.71007, 76.30823, 84.63686, 99.4324, 62.52132, 46.81647, 55.76606, 82.4099, 140.2647, 81.26501, 56.45756, 30.42164, 17.28782, 8.302431, 2.981626, 2.698536, 5.886086, 5.268358], [21.83975, 6.63927, 18.97085, 32.89204, 43.15014, 62.86014, 104.6657, 130.2294, 114.8494, 106.9873, 61.89647, 55.55682, 86.80986, 89.27802, 122.4221, 123.9698, 109.0952, 98.41956, 77.61374, 32.49031, 14.67344, 7.370775, 0.03711011, 0.6423392], [53.34303, 26.79797, 6.63927, 10.88787, 17.2044, 56.18116, 79.70141, 90.8453, 98.27675, 80.87243, 74.7931, 75.54661, 73.4373, 74.11694, 68.1749, 46.24076, 39.93857, 31.21653, 36.88335, 40.02525, 117.4297, 12.70328, 1.729771, 0], [25.66785, 63.05717, 22.1414, 17.074, 41.74483, 60.27227, 81.42432, 114.444, 102.3234, 101.7878, 111.031, 119.2309, 114.0777, 110.5296, 59.19355, 42.47175, 14.63598, 6.944074, 6.944075, 27.74936, 0, 0, 0.09449376, 0.07732264], [12.827, 69.20554, 46.76293, 13.96517, 33.88744, 61.82613, 84.74799, 121.122, 145.2741, 153.1797, 204.786, 227.9242, 236.3038, 228.3655, 79.34425, 25.93483, 6.944074, 6.944074, 6.944075, 7.553681, 0, 0, 0, 0], [0, 68.66396, 59.0435, 33.35762, 47.45282, 57.8355, 78.91689, 107.8275, 168.0053, 130.9597, 212.5541, 165.8122, 210.2429, 181.1713, 189.7617, 137.3378, 84.65395, 8.677168, 6.956576, 8.468093, 0, 0, 0, 0], [0, 95.17499, 80.03818, 59.89862, 39.58476, 50.28058, 63.81641, 80.61302, 66.37824, 198.7651, 244.3467, 294.2474, 264.3517, 176.4082, 60.21857, 77.41475, 53.16981, 56.16393, 6.949235, 7.531059, 3.780177, 0, 0, 0], [0, 134.9879, 130.3696, 96.86325, 75.70494, 58.86466, 57.20374, 55.18837, 78.128, 108.5582, 154.3774, 319.1686, 372.8826, 275.4655, 130.2632, 54.93822, 25.49719, 8.047439, 8.084393, 5.115252, 5.678269, 0, 0, 0], [0, 48.08919, 142.5558, 140.3777, 154.7261, 87.9361, 58.11092, 52.83869, 67.14822, 83.66798, 118.9242, 150.0681, 272.9709, 341.1366, 238.664, 190.2, 116.8943, 91.48672, 14.0157, 42.29277, 5.115252, 0, 0, 0], [0, 54.1941, 146.3839, 99.48143, 96.19411, 102.9473, 76.14089, 57.7844, 47.0402, 64.36799, 84.23767, 162.7181, 121.3275, 213.1646, 328.482, 285.4489, 283.8319, 212.815, 164.549, 92.29631, 7.244015, 1.167, 0, 0], [0, 6.919659, 195.1709, 132.5253, 135.2341, 89.85069, 89.45549, 60.29967, 50.33806, 39.17583, 59.06854, 74.52159, 84.93402, 187.1219, 123.9673, 103.7027, 128.986, 165.1283, 249.7054, 95.39966, 10.00284, 2.39255, 0, 0], [0, 21.73871, 123.1339, 176.7414, 158.2698, 137.235, 105.3089, 86.63255, 53.11591, 29.03865, 30.40539, 39.04902, 49.23405, 63.27853, 111.4215, 101.1956, 40.00962, 59.84565, 74.51253, 17.06316, 2.435141, 2.287471, -0.0003636982, 0], [0, 0, 62.04672, 136.3122, 201.7952, 168.1343, 95.2046, 58.90624, 46.94091, 49.27053, 37.10416, 17.97011, 30.93697, 33.39257, 44.03077, 55.64542, 78.22423, 14.42782, 9.954997, 7.768213, 13.0254, 21.73166, 2.156372, 0.5317867], [0, 0, 79.62993, 139.6978, 173.167, 192.8718, 196.3499, 144.6611, 106.5424, 57.16653, 41.16107, 32.12764, 13.8566, 10.91772, 12.07177, 22.38254, 24.72105, 6.803666, 4.200841, 16.46857, 15.70744, 33.96221, 7.575688, -0.04880907], [0, 0, 33.2664, 57.53643, 167.2241, 196.4833, 194.7966, 182.1884, 119.6961, 73.02113, 48.36549, 33.74652, 26.2379, 16.3578, 6.811293, 6.63927, 6.639271, 8.468093, 6.194273, 3.591233, 3.81486, 8.600739, 5.21889, 0], [0, 0, 29.77937, 54.97282, 144.7995, 207.4904, 165.3432, 171.4047, 174.9216, 100.2733, 61.46441, 50.19171, 26.08209, 17.18218, 8.468093, 6.63927, 6.334467, 6.334467, 5.666687, 4.272203, 0, 0, 0, 0], [0, 0, 31.409, 132.7418, 185.5796, 121.8299, 185.3841, 160.6566, 116.1478, 118.1078, 141.7946, 65.56351, 48.84066, 23.13864, 18.12932, 10.28531, 6.029663, 6.044627, 5.694764, 3.739085, 3.896037, 0, 0, 0], [0, 0, 19.58994, 42.30355, 96.26777, 187.1207, 179.6626, 221.3898, 154.2617, 142.1604, 148.5737, 67.17937, 40.69044, 39.74512, 26.10166, 14.48469, 8.65873, 3.896037, 3.571392, 3.896037, 3.896037, 3.896037, 1.077756, 0], [0.001229679, 3.008948, 5.909858, 33.50574, 104.3341, 152.2165, 198.1988, 191.841, 228.7349, 168.1041, 144.2759, 110.7436, 57.65214, 42.63504, 27.91891, 15.41052, 8.056102, 3.90283, 3.879774, 3.936718, 3.968634, 0.1236256, 3.985531, -0.1835741], [0, 5.626141, 7.676256, 63.16226, 45.99762, 79.56688, 227.311, 203.9287, 172.5618, 177.1462, 140.4554, 123.9905, 110.346, 65.12319, 34.31887, 24.5278, 9.561069, 3.334991, 5.590495, 5.487353, 5.909499, 5.868994, 5.833817, 3.568177]]
)
])
layout = Layout(
title='Mt Bruno Elevation',
autosize=False,
width=500,
height=500,
margin=Margin(
l=65,
r=50,
b=65,
t=90
)
)
fig = Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='elevations-3d-surface')
In [31]:
x = ['day 1', 'day 1', 'day 1', 'day 1', 'day 1', 'day 1',
'day 2', 'day 2', 'day 2', 'day 2', 'day 2', 'day 2']
trace1 = Box(
y=[0.2, 0.2, 0.6, 1.0, 0.5, 0.4, 0.2, 0.7, 0.9, 0.1, 0.5, 0.3],
x=x,
name='kale',
marker=Marker(
color='#3D9970'
)
)
trace2 = Box(
y=[0.6, 0.7, 0.3, 0.6, 0.0, 0.5, 0.7, 0.9, 0.5, 0.8, 0.7, 0.2],
x=x,
name='radishes',
marker=Marker(
color='#FF4136'
)
)
trace3 = Box(
y=[0.1, 0.3, 0.1, 0.9, 0.6, 0.6, 0.9, 1.0, 0.3, 0.6, 0.8, 0.5],
x=x,
name='carrots',
marker=Marker(
color='#FF851B'
)
)
data = Data([trace1, trace2, trace3])
layout = Layout(
yaxis=YAxis(
title='normalized moisture',
zeroline=False
),
boxmode='group'
)
fig = Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='box-grouped')
Stats Homework
In [27]:
"""
Created on Wed Oct 29 21:33:04 2014
@author: sarahbeckett-hile
"""
import pandas as pd
from math import factorial, pow
import numpy as np
import scipy as sp
#%%
'''
1. Let r be a binomial random variable. Compute Pr(r) for each of the following situations:
'''
# There are lots of good ways to do this
# Either make a couple of lists....
Ns = [10, 4, 16]
thetas = [.2, .4, .7]
Rs = [3, 2, 12]
questions = ['1a', '1b', '1c']
#%%
#...and then write a while loop
i = 0
while i < len(questions):
n = Ns[i] #name the different parts of the equation for easy reading
theta = thetas[i]
r = Rs[i]
factorials = factorial(n) / (factorial(r)*factorial(n-r)) #this is the first chunk of the equation
probability = factorials*pow(theta, r)*pow((1-theta), (n-r))
rounded_prob = round(probability, 3)
print('{}.'.format(questions[i]), rounded_prob)#
i += 1 #add 1 to the value of i to make sure this loop ends. Doing while loops like this can be risky if you forget to add this part
#%%
###...or we could put it in dictionaries and make a DataFrame:
dict1 = {'n': Ns, 'theta': thetas, 'r': Rs} #create a dictionary from the lists
#%%
df = pd.DataFrame(data=dict1, index=questions) #create a DataFrame from the dictionary
#%%
# we'll have to use scipy.misc.factorial instead of math..factorial because math cannot work with a series, which is what we're using when applying functions to Pandas df columns
df['probability'] = (
(sp.misc.factorial(df.n, exact = False)/
(sp.misc.factorial(df.r, exact=False)*sp.misc.factorial(df.n-df.r, exact=False)))
*np.power(df.theta, df.r)*np.power(1-df.theta, df.n-df.r)
)
#%%
# or we could write a function:
def binomial_pdf(n, theta, r):
factorials = factorial(n)/(factorial(r)*factorial(n-r))
probability = factorials*pow(theta, r)*pow((1-theta), (n-r))
return probability
#%%
binomial_pdf(10, .2, 3)
#%%
# finally, the simplest option - use a module that already does this:
# the module we'll use is scipy. List R first, then N and P (probability)
# http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.binom.html
sp.stats.distributions.binom.pmf(3, n = 10, p =.2)
#%%
'''
2. A chain of motels has adopted a policy of giving a 3% discount to customers who pay in cash
rather than by credit cards. Its experience is that 30% of all customers take the discount. Let Y =
number of discount takers among the next 20 customers.
a. Do you think the binomial assumptions are reasonable in this situation?
'''
print('2a. Yes, binomial assumptions are reasonable in this siutation')
#%%
'''
b. Assuming that the binomial probabilities apply, find the probability that exactly 5 of
the next 20 customers take the discount.
'''
# we already have something for this!
print('2b.', binomial_pdf(20, .3, 5))
#%%
'''
c. Find P(5 or fewer customers take the discount).
'''
r_list = range(0,6)
p_5_or_fewer = 0
for r in r_list:
prob = binomial_pdf(20, .3, r)
p_5_or_fewer =+ p_5_or_fewer + prob
print('2c.','Probability that 5 or fewer customers will take the discount:', round(p_5_or_fewer, 3))
#%%
'''
d. What is the most probable number of discount takers in the next 20 customers?
'''
print('2d.', .3*20)
#%%
# or, to replicate the answers given in the assignment's answer key:
N = 20
theta = .3
binom_pdf_df = pd.DataFrame(index=range(0, N+1))
#%%
cumulative = 0
for index, i in binom_pdf_df.iterrows():
probability = binomial_pdf(N, theta, index)
binom_pdf_df.ix[index, 'P(X = x)'] = round(probability, 4)
cumulative += probability
binom_pdf_df.ix[index, 'P(X <= x)'] = round(cumulative, 4)
#%%
print('Binomial with n = {} and p = {}'.format(N, theta))
binom_pdf_df
#%%
# this looks crazy messy, but all it's doing is asking for the value of the index where 'P(X = x)' is greatest
binom_pdf_df.index[binom_pdf_df['P(X = x)']==binom_pdf_df['P(X = x)'].max()].item()
#%%
# All of this can be turned into a new and fairly comprehensive function:
def binom_pdf(n, theta, R=None, Return=None):
df = pd.DataFrame(index=range(0, n+1))
cumulative = 0
for index, i in df.iterrows():
r = index
factorials = factorial(n) / (factorial(r)*factorial(n-r)) # recreating the pdf function from so it can standalone outside this script
probability = factorials*pow(theta, r)*pow((1-theta), (n-r)) # need to use this and it can run independantly: from math import factorial, pow
df.ix[index, 'P(X = x)'] = round(probability, 4)
cumulative += probability
df.ix[index, 'P(X <= x)'] = round(cumulative, 4)
df.ix[index, 'P(X => x)'] = round(1-cumulative, 4)
if R == None:
return df
if R != None:
if Return == None:
return df[df.index == R]
#let's make some plain English options for specifying the value you're looking for with this function
if Return == 'exactly':
return df[df.index == R]['P(X = x)'].item()
if Return == 'at_most' or Return == 'or_fewer' or Return == 'not_more_than' :
return df[df.index == R]['P(X <= x)'].item()
if Return == 'at_least' or Return == 'or_more':
return df[df.index == R-1]['P(X => x)'].item()
if Return == 'less_than' or Return == 'fewer_than':
return df[df.index == R-1]['P(X <= x)'].item()
if Return == 'greater_than' or Return == 'more_than':
return df[df.index == R]['P(X => x)'].item()
return df[df.index == R]
#%%
# since it's optional to submit a number for R, you can leave it out and get a table with all variables
print(binom_pdf(20, .3))
#%%
# or, if you are only interested in a particular value for R:
binom_pdf(20, .3, 5)
#%%
#or, if you want to get really specific:
binom_pdf(20, .3, 5, 'or_fewer')
#%%
'''
3. The admissions office of a small, selective liberal-arts college will only offer admission to
applicants who have a certain mix of accomplishments, including a combined SAT score of 1,400
or more. Based on past records, the head of admissions feels that the probability is 0.66 that an
admitted applicant will come to the college. If 500 applicants are admitted, what is the probability
that 340 or more will come? Note that “340 or more” means the set of values {340, 341, 342,
343, …, 499, 500}.
'''
# this is a great example for when we'd want to use the new binom_pdf function
# Instead of "R or fewer", we want "R or more"
# In other words, we want 'P(X => x)'
# luckily I already threw in this column!
#print binom_pdf(500, .66, 339)['P(X => x)'].item()
print('3a.', binom_pdf(500, .66, 340, 'at_least'))
# Bam, we're done!
#%%
'''
4. Suppose that a full-repair warranty is offered with each new Power-Up foodprocessor. If the
probability that any individual food processor will be returned forneeded warranty repairs within
one year is 0.11, and if a certain store sells 83 of these,find the probabilities that...
a. at most 10 food processors will be returned for warranty repairs;
'''
print('4a.', binom_pdf(83, .11, 10, 'at_most'))
#%%
'''
b. at least 10 food processors will be returned for warranty repairs
'''
print('4b.', binom_pdf(83, .11, 10, 'at_least'))
#%%
'''
c. exactly 10 food processors will be returned for warranty repairs;
'''
print('4c.', binom_pdf(83, .11, 10, 'exactly'))
#%%
'''
d. not more than 15 food processors will be returned for warranty repairs.
'''
print('4d.', binom_pdf(83, .11, 15, 'not_more_than'))
#%%
'''
5. The rate of home sales at a small real estate agency is 1.3 per day. We’ll assume that a Poisson
phenomenon can represent these home sales.
a. Find the probability that no homes will be sold on Monday.
'''
# First, let's write how how we'd do this if there was no existing module for Poisson probability
# we'll still cheat and use scipy to do exp()
# http://docs.scipy.org/doc/numpy/reference/routines.math.html
mean = 1.3
r = 0
poisson_probability = sp.exp(-mean) * (pow(mean,r)/factorial(r))
print('5a. part 1:', poisson_probability)
#%%
# fortunately, a fuction for Poisson probability does exists in scipy
# http://stackoverflow.com/questions/280797/calculate-poisson-probability-percentage
no_sales = sp.stats.distributions.poisson.pmf(0, 1.3)
print('5a. part 2:', no_sales)
#%%
'''
b. Find the probability that one home will be sold on Monday.
'''
one_sale = sp.stats.distributions.poisson.pmf(1, 1.3)
print('5b.', one_sale)
#%%
'''
c. Find the probability that two homes will be sold on Monday.
'''
print('5c.', sp.stats.distributions.poisson.pmf(2, 1.3))
#%%
'''
d. Find the probability that more than two homes will be sold on Monday
'''
no_sales = sp.stats.distributions.poisson.pmf(0, 1.3)
one_sale = sp.stats.distributions.poisson.pmf(1, 1.3)
two_sales = sp.stats.distributions.poisson.pmf(2, 1.3)
more_than_two_sales = 1 - no_sales - one_sale - two_sales
print('5d.', more_than_two_sales)
In [ ]: