In [28]:
import pandas as pd
from pylab import savefig
import seaborn as sns
import plotly.plotly as py
from plotly.graph_objs import *
py.sign_in("sebecketthile", "a649z2ue29")
%matplotlib inline

In [29]:
# Import data into dataframe with read_csv (FiveThirtyEight recent-grads data):
df = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv')


# Average the numeric columns within each major category.
# numeric_only=True is required on modern pandas (>= 2.0), where .mean() on a
# group that still contains string columns (e.g. 'Major') raises a TypeError
# instead of silently dropping them.
group_df = df.groupby('Major_category').mean(numeric_only=True)
group_df

# Transpose so each major category becomes a column holding its quartile
# values (P25th, Median, P75th) -- one box per category in the plot below.
quartile_df = group_df[['Median','P25th','P75th']].transpose()
quartile_df

sns.set_context('poster')
sns.set_style('whitegrid')
sns.boxplot(quartile_df,  vert=False, linewidth=3)
sns.despine()
savefig('majors.png', bbox_inches='tight')



In [14]:
sns.set(font="monospace")

# Correlation clustermap of the seaborn brain-networks example dataset,
# with rows and columns annotated by a per-network colour strip.
brain_df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)

# Keep only a hand-picked subset of the networks.
used_networks = [1, 5, 6, 7, 8, 11, 12, 13, 16, 17]
keep = (brain_df.columns.get_level_values("network")
                .astype(int)
                .isin(used_networks))
brain_df = brain_df.loc[:, keep]

# One cubehelix colour per selected network, keyed by the network id as a
# string (the column level values are strings).
palette = sns.cubehelix_palette(len(used_networks),
                                light=.9, dark=.1, reverse=True,
                                start=1, rot=-2)
network_lut = dict(zip(map(str, used_networks), palette))

# Map each remaining column's network id to its colour for the side strips.
network_colors = pd.Series(
    brain_df.columns.get_level_values("network")).map(network_lut)

cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)

sns.clustermap(brain_df.corr(), row_colors=network_colors, method="average",
               col_colors=network_colors, figsize=(13, 13), cmap=cmap)


Out[14]:
<seaborn.matrix.ClusterGrid at 0x10c62d860>

In [15]:
import numpy as np
from scipy.stats import kendalltau
import seaborn as sns

sns.set(style="ticks")

# Hexbin joint plot of a gamma-distributed variable against a noisy linear
# response, annotated with Kendall's tau. The seeded RNG keeps the figure
# reproducible run-to-run.
rng = np.random.RandomState(11)
x = rng.gamma(2, size=1000)
y = rng.normal(size=1000) - .5 * x

sns.jointplot(x, y, kind="hex", stat_func=kendalltau, color="#4CB391")


Out[15]:
<seaborn.axisgrid.JointGrid at 0x10c6496d8>

In [30]:
# 3-D surface plot of the Mt Bruno elevation grid (plotly 1.x legacy API:
# Data/Surface/Layout/Margin/Figure come from `from plotly.graph_objs import *`).
# z is a 25x24 grid of elevation samples.
data = Data([
    Surface(
        z=[[27.80985, 49.61936, 83.08067, 116.6632, 130.414, 150.7206, 220.1871, 156.1536, 148.6416, 203.7845, 206.0386, 107.1618, 68.36975, 45.3359, 49.96142, 21.89279, 17.02552, 11.74317, 14.75226, 13.6671, 5.677561, 3.31234, 1.156517, -0.147662], [27.71966, 48.55022, 65.21374, 95.27666, 116.9964, 133.9056, 152.3412, 151.934, 160.1139, 179.5327, 147.6184, 170.3943, 121.8194, 52.58537, 33.08871, 38.40972, 44.24843, 69.5786, 4.019351, 3.050024, 3.039719, 2.996142, 2.967954, 1.999594], [30.4267, 33.47752, 44.80953, 62.47495, 77.43523, 104.2153, 102.7393, 137.0004, 186.0706, 219.3173, 181.7615, 120.9154, 143.1835, 82.40501, 48.47132, 74.71461, 60.0909, 7.073525, 6.089851, 6.53745, 6.666096, 7.306965, 5.73684, 3.625628], [16.66549, 30.1086, 39.96952, 44.12225, 59.57512, 77.56929, 106.8925, 166.5539, 175.2381, 185.2815, 154.5056, 83.0433, 62.61732, 62.33167, 60.55916, 55.92124, 15.17284, 8.248324, 36.68087, 61.93413, 20.26867, 68.58819, 46.49812, 0.2360095], [8.815617, 18.3516, 8.658275, 27.5859, 48.62691, 60.18013, 91.3286, 145.7109, 116.0653, 106.2662, 68.69447, 53.10596, 37.92797, 47.95942, 47.42691, 69.20731, 44.95468, 29.17197, 17.91674, 16.25515, 14.65559, 17.26048, 31.22245, 46.71704], [6.628881, 10.41339, 24.81939, 26.08952, 30.1605, 52.30802, 64.71007, 76.30823, 84.63686, 99.4324, 62.52132, 46.81647, 55.76606, 82.4099, 140.2647, 81.26501, 56.45756, 30.42164, 17.28782, 8.302431, 2.981626, 2.698536, 5.886086, 5.268358], [21.83975, 6.63927, 18.97085, 32.89204, 43.15014, 62.86014, 104.6657, 130.2294, 114.8494, 106.9873, 61.89647, 55.55682, 86.80986, 89.27802, 122.4221, 123.9698, 109.0952, 98.41956, 77.61374, 32.49031, 14.67344, 7.370775, 0.03711011, 0.6423392], [53.34303, 26.79797, 6.63927, 10.88787, 17.2044, 56.18116, 79.70141, 90.8453, 98.27675, 80.87243, 74.7931, 75.54661, 73.4373, 74.11694, 68.1749, 46.24076, 39.93857, 31.21653, 36.88335, 40.02525, 117.4297, 12.70328, 1.729771, 0], [25.66785, 63.05717, 22.1414, 17.074, 41.74483, 60.27227, 81.42432, 114.444, 
102.3234, 101.7878, 111.031, 119.2309, 114.0777, 110.5296, 59.19355, 42.47175, 14.63598, 6.944074, 6.944075, 27.74936, 0, 0, 0.09449376, 0.07732264], [12.827, 69.20554, 46.76293, 13.96517, 33.88744, 61.82613, 84.74799, 121.122, 145.2741, 153.1797, 204.786, 227.9242, 236.3038, 228.3655, 79.34425, 25.93483, 6.944074, 6.944074, 6.944075, 7.553681, 0, 0, 0, 0], [0, 68.66396, 59.0435, 33.35762, 47.45282, 57.8355, 78.91689, 107.8275, 168.0053, 130.9597, 212.5541, 165.8122, 210.2429, 181.1713, 189.7617, 137.3378, 84.65395, 8.677168, 6.956576, 8.468093, 0, 0, 0, 0], [0, 95.17499, 80.03818, 59.89862, 39.58476, 50.28058, 63.81641, 80.61302, 66.37824, 198.7651, 244.3467, 294.2474, 264.3517, 176.4082, 60.21857, 77.41475, 53.16981, 56.16393, 6.949235, 7.531059, 3.780177, 0, 0, 0], [0, 134.9879, 130.3696, 96.86325, 75.70494, 58.86466, 57.20374, 55.18837, 78.128, 108.5582, 154.3774, 319.1686, 372.8826, 275.4655, 130.2632, 54.93822, 25.49719, 8.047439, 8.084393, 5.115252, 5.678269, 0, 0, 0], [0, 48.08919, 142.5558, 140.3777, 154.7261, 87.9361, 58.11092, 52.83869, 67.14822, 83.66798, 118.9242, 150.0681, 272.9709, 341.1366, 238.664, 190.2, 116.8943, 91.48672, 14.0157, 42.29277, 5.115252, 0, 0, 0], [0, 54.1941, 146.3839, 99.48143, 96.19411, 102.9473, 76.14089, 57.7844, 47.0402, 64.36799, 84.23767, 162.7181, 121.3275, 213.1646, 328.482, 285.4489, 283.8319, 212.815, 164.549, 92.29631, 7.244015, 1.167, 0, 0], [0, 6.919659, 195.1709, 132.5253, 135.2341, 89.85069, 89.45549, 60.29967, 50.33806, 39.17583, 59.06854, 74.52159, 84.93402, 187.1219, 123.9673, 103.7027, 128.986, 165.1283, 249.7054, 95.39966, 10.00284, 2.39255, 0, 0], [0, 21.73871, 123.1339, 176.7414, 158.2698, 137.235, 105.3089, 86.63255, 53.11591, 29.03865, 30.40539, 39.04902, 49.23405, 63.27853, 111.4215, 101.1956, 40.00962, 59.84565, 74.51253, 17.06316, 2.435141, 2.287471, -0.0003636982, 0], [0, 0, 62.04672, 136.3122, 201.7952, 168.1343, 95.2046, 58.90624, 46.94091, 49.27053, 37.10416, 17.97011, 30.93697, 33.39257, 44.03077, 
55.64542, 78.22423, 14.42782, 9.954997, 7.768213, 13.0254, 21.73166, 2.156372, 0.5317867], [0, 0, 79.62993, 139.6978, 173.167, 192.8718, 196.3499, 144.6611, 106.5424, 57.16653, 41.16107, 32.12764, 13.8566, 10.91772, 12.07177, 22.38254, 24.72105, 6.803666, 4.200841, 16.46857, 15.70744, 33.96221, 7.575688, -0.04880907], [0, 0, 33.2664, 57.53643, 167.2241, 196.4833, 194.7966, 182.1884, 119.6961, 73.02113, 48.36549, 33.74652, 26.2379, 16.3578, 6.811293, 6.63927, 6.639271, 8.468093, 6.194273, 3.591233, 3.81486, 8.600739, 5.21889, 0], [0, 0, 29.77937, 54.97282, 144.7995, 207.4904, 165.3432, 171.4047, 174.9216, 100.2733, 61.46441, 50.19171, 26.08209, 17.18218, 8.468093, 6.63927, 6.334467, 6.334467, 5.666687, 4.272203, 0, 0, 0, 0], [0, 0, 31.409, 132.7418, 185.5796, 121.8299, 185.3841, 160.6566, 116.1478, 118.1078, 141.7946, 65.56351, 48.84066, 23.13864, 18.12932, 10.28531, 6.029663, 6.044627, 5.694764, 3.739085, 3.896037, 0, 0, 0], [0, 0, 19.58994, 42.30355, 96.26777, 187.1207, 179.6626, 221.3898, 154.2617, 142.1604, 148.5737, 67.17937, 40.69044, 39.74512, 26.10166, 14.48469, 8.65873, 3.896037, 3.571392, 3.896037, 3.896037, 3.896037, 1.077756, 0], [0.001229679, 3.008948, 5.909858, 33.50574, 104.3341, 152.2165, 198.1988, 191.841, 228.7349, 168.1041, 144.2759, 110.7436, 57.65214, 42.63504, 27.91891, 15.41052, 8.056102, 3.90283, 3.879774, 3.936718, 3.968634, 0.1236256, 3.985531, -0.1835741], [0, 5.626141, 7.676256, 63.16226, 45.99762, 79.56688, 227.311, 203.9287, 172.5618, 177.1462, 140.4554, 123.9905, 110.346, 65.12319, 34.31887, 24.5278, 9.561069, 3.334991, 5.590495, 5.487353, 5.909499, 5.868994, 5.833817, 3.568177]]
    )
])
# Fixed-size 500x500 figure; margins leave room for the axis labels.
layout = Layout(
    title='Mt Bruno Elevation',
    autosize=False,
    width=500,
    height=500,
    margin=Margin(
        l=65,
        r=50,
        b=65,
        t=90
    )
)
fig = Figure(data=data, layout=layout)
# Uploads the figure to the plotly account signed in above and returns its URL.
plot_url = py.plot(fig, filename='elevations-3d-surface')

In [31]:
# Grouped box plots: three series of moisture measurements over two days.
x = ['day 1', 'day 1', 'day 1', 'day 1', 'day 1', 'day 1',
     'day 2', 'day 2', 'day 2', 'day 2', 'day 2', 'day 2']

# Build the three traces in one pass instead of three copy-pasted Box(...)
# blocks; each tuple is (trace name, marker colour, y observations).
trace1, trace2, trace3 = (
    Box(y=y_vals, x=x, name=label, marker=Marker(color=colour))
    for label, colour, y_vals in [
        ('kale',     '#3D9970', [0.2, 0.2, 0.6, 1.0, 0.5, 0.4, 0.2, 0.7, 0.9, 0.1, 0.5, 0.3]),
        ('radishes', '#FF4136', [0.6, 0.7, 0.3, 0.6, 0.0, 0.5, 0.7, 0.9, 0.5, 0.8, 0.7, 0.2]),
        ('carrots',  '#FF851B', [0.1, 0.3, 0.1, 0.9, 0.6, 0.6, 0.9, 1.0, 0.3, 0.6, 0.8, 0.5]),
    ]
)
data = Data([trace1, trace2, trace3])

# boxmode='group' draws the three vegetables side by side within each day.
layout = Layout(
    yaxis=YAxis(
        title='normalized moisture',
        zeroline=False
    ),
    boxmode='group'
)
fig = Figure(data=data, layout=layout)
plot_url = py.plot(fig, filename='box-grouped')

Stats Homework


In [27]:
"""
Created on Wed Oct 29 21:33:04 2014
@author: sarahbeckett-hile
"""
import pandas as pd
from math import factorial, pow
import numpy as np
import scipy as sp
#%%
'''
 1. Let r be a binomial random variable. Compute Pr(r) for each of the following situations:
'''
# There are lots of good ways to do this
# Either make a couple of lists....
Ns = [10, 4, 16]
thetas = [.2, .4, .7]
Rs = [3, 2, 12]
questions = ['1a', '1b', '1c']
#%%
#...and then write a while loop
i = 0
while i < len(questions):
    n = Ns[i] #name the different parts of the equation for easy reading
    theta = thetas[i]
    r = Rs[i]
    factorials = factorial(n) / (factorial(r)*factorial(n-r)) #this is the first chunk of the equation
    probability = factorials*pow(theta, r)*pow((1-theta), (n-r))
    rounded_prob = round(probability, 3)
    print('{}.'.format(questions[i]), rounded_prob)# 
    i += 1 #add 1 to the value of i to make sure this loop ends. Doing while loops like this can be risky if you forget to add this part
#%%
###...or we could put it in dictionaries and make a DataFrame:
dict1 = {'n': Ns, 'theta': thetas, 'r': Rs} #create a dictionary from the lists
#%%
df = pd.DataFrame(data=dict1, index=questions) #create a DataFrame from the dictionary
#%%
# We need a vectorized factorial because math.factorial cannot operate on a
# whole pandas Series at once. scipy.misc.factorial was removed in SciPy 1.3;
# its replacement lives in scipy.special.
from scipy.special import factorial as vec_factorial
df['probability'] = ( 
    (vec_factorial(df.n, exact=False)/
    (vec_factorial(df.r, exact=False)*vec_factorial(df.n-df.r, exact=False)))
    *np.power(df.theta, df.r)*np.power(1-df.theta, df.n-df.r)
    )
#%%
# or we could write a function:
def binomial_pdf(n, theta, r):
    """Binomial pmf: probability of exactly r successes in n independent
    trials, each succeeding with probability theta."""
    n_choose_r = factorial(n) / (factorial(r) * factorial(n - r))
    return n_choose_r * pow(theta, r) * pow(1 - theta, n - r)
#%%
binomial_pdf(10, .2, 3)
#%%
# finally, the simplest option - use a module that already does this:
# the module we'll use is scipy. List R first, then N and P (probability)
# http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.binom.html
sp.stats.distributions.binom.pmf(3, n = 10, p =.2)
#%%

'''
2. A chain of motels has adopted a policy of giving a 3% discount to customers who pay in cash 
rather than by credit cards. Its experience is that 30% of all customers take the discount. Let Y = 
number of discount takers among the next 20 customers.
    a. Do you think the binomial assumptions are reasonable in this situation?
'''
print('2a. Yes, binomial assumptions are reasonable in this situation')
#%%
'''
 b. Assuming that the binomial probabilities apply, find the probability that exactly 5 of 
the next 20 customers take the discount.
'''
# we already have something for this!
print('2b.', binomial_pdf(20, .3, 5))
#%%
'''
c. Find P(5 or fewer customers take the discount).
'''
# Sum the pmf over r = 0..5 to get the cumulative probability P(Y <= 5).
p_5_or_fewer = 0
for r in range(0, 6):
    # The original wrote "p_5_or_fewer =+ p_5_or_fewer + prob", a typo for
    # "+=" that only produced the right answer because unary plus is a no-op.
    p_5_or_fewer += binomial_pdf(20, .3, r)
print('2c.','Probability that 5 or fewer customers will take the discount:', round(p_5_or_fewer, 3))
#%%
'''
d. What is the most probable number of discount takers in the next 20 customers?
'''
# The mode of a binomial distribution sits at (or next to) its mean, n*theta.
print('2d.', .3*20)
#%%
# or, to replicate the answers given in the assignment's answer key:
N = 20
theta = .3
binom_pdf_df = pd.DataFrame(index=range(0, N+1))
#%%
# Build pmf and cumulative-probability columns row by row.
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based replacement.
cumulative = 0
for index, i in binom_pdf_df.iterrows():
    probability = binomial_pdf(N, theta, index)
    binom_pdf_df.loc[index, 'P(X = x)'] = round(probability, 4)
    cumulative += probability
    binom_pdf_df.loc[index, 'P(X <= x)'] = round(cumulative, 4)
#%%
print('Binomial with n = {} and p = {}'.format(N, theta))
binom_pdf_df
#%%
# this looks crazy messy, but all it's doing is asking for the value of the index where 'P(X = x)' is greatest
binom_pdf_df.index[binom_pdf_df['P(X = x)']==binom_pdf_df['P(X = x)'].max()].item()
#%%
# All of this can be turned into a new and fairly comprehensive function:
def binom_pdf(n, theta, R=None, Return=None):
    """Build a binomial probability table, or look up one probability from it.

    Parameters
    ----------
    n : int
        Number of trials.
    theta : float
        Success probability of a single trial.
    R : int, optional
        A particular number of successes to look up. If omitted, the full
        table for r = 0..n is returned.
    Return : str, optional
        Plain-English selector for which probability to return for R:
        'exactly' -> P(X = R);
        'at_most' / 'or_fewer' / 'not_more_than' -> P(X <= R);
        'at_least' / 'or_more' -> P(X >= R);
        'less_than' / 'fewer_than' -> P(X < R);
        'greater_than' / 'more_than' -> P(X > R).
        Any other value falls through to the single-row table for R.
    """
    df = pd.DataFrame(index=range(0, n+1))
    cumulative = 0
    for index, i in df.iterrows():
        r = index
        # pmf computed from the definition so this function can stand alone;
        # it only needs: from math import factorial, pow
        factorials = factorial(n) / (factorial(r)*factorial(n-r))
        probability = factorials*pow(theta, r)*pow((1-theta), (n-r))
        # .loc replaces the removed DataFrame.ix indexer (gone in pandas 1.0)
        df.loc[index, 'P(X = x)'] = round(probability, 4)
        cumulative += probability
        df.loc[index, 'P(X <= x)'] = round(cumulative, 4)
        df.loc[index, 'P(X => x)'] = round(1-cumulative, 4)
    if R is None:
        return df
    if Return is None:
        return df[df.index == R]
    # plain English options for specifying the value you're looking for
    if Return == 'exactly':
        return df[df.index == R]['P(X = x)'].item()
    if Return in ('at_most', 'or_fewer', 'not_more_than'):
        return df[df.index == R]['P(X <= x)'].item()
    if Return in ('at_least', 'or_more'):
        # P(X >= R) is read off the previous row's "1 - CDF" column
        return df[df.index == R-1]['P(X => x)'].item()
    if Return in ('less_than', 'fewer_than'):
        return df[df.index == R-1]['P(X <= x)'].item()
    if Return in ('greater_than', 'more_than'):
        return df[df.index == R]['P(X => x)'].item()
    return df[df.index == R]
#%%
# since it's optional to submit a number for R, you can leave it out and get a table with all variables
print(binom_pdf(20, .3))
#%%
# or, if you are only interested in a particular value for R:
binom_pdf(20, .3, 5)
#%%
#or, if you want to get really specific:
binom_pdf(20, .3, 5, 'or_fewer')
#%%
'''
3. The admissions office of a small, selective liberal-arts college will only offer admission to 
applicants who have a certain mix of accomplishments, including a combined SAT score of 1,400 
or more. Based on past records, the head of admissions feels that the probability is 0.66 that an 
admitted applicant will come to the college. If 500 applicants are admitted, what is the probability 
that 340 or more will come? Note that “340 or more” means the set of values {340, 341, 342, 
343, …, 499, 500}.
'''
# this is a great example for when we'd want to use the new binom_pdf function
# Instead of "R or fewer", we want "R or more"
# In other words, we want 'P(X => x)' 
# luckily I already threw in this column!
#print binom_pdf(500, .66, 339)['P(X => x)'].item()
# 'at_least' reads row R-1 of the 'P(X => x)' column, i.e. P(X >= 340).
print('3a.', binom_pdf(500, .66, 340, 'at_least'))
# Bam, we're done!
#%%
'''
4. Suppose that a full-repair warranty is offered with each new Power-Up foodprocessor. If the 
probability that any individual food processor will be returned forneeded warranty repairs within 
one year is 0.11, and if a certain store sells 83 of these,find the probabilities that...
a. at most 10 food processors will be returned for warranty repairs;
'''
# P(X <= 10) with n=83, theta=0.11
print('4a.', binom_pdf(83, .11, 10, 'at_most'))

#%%
'''
b. at least 10 food processors will be returned for warranty repairs
'''
# P(X >= 10)
print('4b.', binom_pdf(83, .11, 10, 'at_least'))
#%%
'''
c. exactly 10 food processors will be returned for warranty repairs;
'''
# P(X = 10)
print('4c.', binom_pdf(83, .11, 10, 'exactly'))
#%%
'''
d. not more than 15 food processors will be returned for warranty repairs.
'''
# P(X <= 15) -- 'not_more_than' is an alias for 'at_most'
print('4d.', binom_pdf(83, .11, 15, 'not_more_than'))
#%%
'''
5. The rate of home sales at a small real estate agency is 1.3 per day. We’ll assume that a Poisson 
phenomenon can represent these home sales.
a. Find the probability that no homes will be sold on Monday.
'''
# First, compute the Poisson pmf by hand: P(r) = exp(-mean) * mean^r / r!
# http://docs.scipy.org/doc/numpy/reference/routines.math.html
# np.exp replaces the original sp.exp -- the top-level scipy.exp alias was
# deprecated and later removed from SciPy, while numpy's is the real thing.
mean = 1.3
r = 0
poisson_probability = np.exp(-mean) * (pow(mean,r)/factorial(r))
print('5a. part 1:', poisson_probability)
#%%
# fortunately, a function for Poisson probability does exist in scipy
# http://stackoverflow.com/questions/280797/calculate-poisson-probability-percentage
no_sales = sp.stats.distributions.poisson.pmf(0, 1.3)
print('5a. part 2:', no_sales)
#%%
'''
b. Find the probability that one home will be sold on Monday.
'''
one_sale = sp.stats.distributions.poisson.pmf(1, 1.3)
print('5b.', one_sale)
#%%
'''
c. Find the probability that two homes will be sold on Monday.
'''
print('5c.', sp.stats.distributions.poisson.pmf(2, 1.3))
#%%
'''
d. Find the probability that more than two homes will be sold on Monday
'''
# Complement rule: P(X > 2) = 1 - P(0) - P(1) - P(2)
no_sales = sp.stats.distributions.poisson.pmf(0, 1.3)
one_sale = sp.stats.distributions.poisson.pmf(1, 1.3)
two_sales = sp.stats.distributions.poisson.pmf(2, 1.3)
more_than_two_sales = 1 - no_sales - one_sale - two_sales
print('5d.', more_than_two_sales)


1a. 0.201
1b. 0.346
1c. 0.204
2a. Yes, binomial assumptions are reasonable in this siutation
2b. 0.17886305056987956
2c. Probability that 5 or fewer customers will take the discount: 0.416
2d. 6.0
Binomial with n = 20 and p = 0.3
    P(X = x)  P(X <= x)  P(X => x)
0     0.0008     0.0008     0.9992
1     0.0068     0.0076     0.9924
2     0.0278     0.0355     0.9645
3     0.0716     0.1071     0.8929
4     0.1304     0.2375     0.7625
5     0.1789     0.4164     0.5836
6     0.1916     0.6080     0.3920
7     0.1643     0.7723     0.2277
8     0.1144     0.8867     0.1133
9     0.0654     0.9520     0.0480
10    0.0308     0.9829     0.0171
11    0.0120     0.9949     0.0051
12    0.0039     0.9987     0.0013
13    0.0010     0.9997     0.0003
14    0.0002     1.0000     0.0000
15    0.0000     1.0000     0.0000
16    0.0000     1.0000     0.0000
17    0.0000     1.0000     0.0000
18    0.0000     1.0000     0.0000
19    0.0000     1.0000     0.0000
20    0.0000     1.0000     0.0000
3a. 0.1852
4a. 0.6969
4b. 0.4306
4c. 0.1275
4d. 0.982
5a. part 1: 0.272531793034
5a. part 2: 0.272531793034
5b. 0.354291330944
5c. 0.230289365114
5d. 0.142887510908

In [ ]: