In [18]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the body-temperature observations into a pandas DataFrame.
df = pd.read_csv('data/human_body_temperature.csv')

# Split the rows into one DataFrame per value of the `gender` column.
dfMale = df.loc[df['gender'] == 'M']
dfFemale = df.loc[df['gender'] == 'F']

# Distribution plot of every temperature reading via seaborn
# (original author's note: ensure correct version of statsmodels=0.8.0).
sns.distplot(df['temperature'])

# Scatter of temperature vs heart rate, male and female drawn on the same axes:
# the first call creates the axes, the second reuses them through `ax=`.
scatter_kwargs = dict(kind='scatter', x='temperature', y='heart_rate')
ax = dfMale.plot(color='Blue', label='Male', **scatter_kwargs)
dfFemale.plot(color='Green', label='Female', ax=ax, **scatter_kwargs)

# Summary statistics for the full data set, then for each group.
for caption, subset in (("ALL: ", df), ("Male: ", dfMale), ("Female: ", dfFemale)):
    print(caption, subset.describe())
In [3]:
import matplotlib.pyplot as plt
import scipy.stats as stats
# --- Is the temperature distribution normal? ---------------------------------
# scipy.stats.normaltest returns (statistic, p-value); its null hypothesis is
# that the sample comes from a normal distribution, so a small p-value is
# evidence AGAINST normality.
ALPHA = 0.05  # significance level used to reject the null hypothesis
_, pval = stats.normaltest(df['temperature'])
if pval < ALPHA:
    print ("Not normal distribution")
else:
    print ("Normal distribution")

# --- Is the sample size large? -----------------------------------------------
# Author's note on necessary sample size:
#   Necessary Sample Size = (Z-score)^2 * StdDev * (1 - StdDev) / (margin of error)^2
# Assuming a 95% confidence level, .5 standard deviation, and a margin of error
# (confidence interval) of +/- 5%:
#   ((1.96)^2 x .5(.5)) / (.05)^2
#   = (3.8416 x .25) / .0025
#   = .9604 / .0025
#   = 384.16
# so 385 respondents are needed.
# NOTE(review): the check below uses a threshold of 30 observations, not the
# 385 derived above -- confirm which criterion is intended.
MIN_LARGE_SAMPLE = 30
if df.shape[0] >= MIN_LARGE_SAMPLE:
    print ("Sample size is large")
else:
    print ("Sample size is NOT large")

# --- Are the observations independent? ---------------------------------------
# Independence means the occurrence of one observation provides no information
# about the occurrence of another: observations don't affect the probability of
# future observations (unrelated observations).
# These notes are comments rather than bare string literals so the cell does not
# echo them back as its output value.
In [4]:
from statsmodels.stats.weightstats import DescrStatsW, ttest_ind, ztest
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math
# Two-sample, two-sided t-tests between the male and female groups.
temperature_ttest = ttest_ind(dfMale.temperature, dfFemale.temperature, alternative='two-sided')
print('Male vs Female temperature: (ttest, p-value, df)=', temperature_ttest)
heart_rate_ttest = ttest_ind(dfMale.heart_rate, dfFemale.heart_rate, alternative='two-sided')
print('Male vs Female heart beat: (ttest, p-value, df)=', heart_rate_ttest)

temperatures = df['temperature']

# One-sample, two-sided z-test of the mean temperature against 98.6.
print('(ztest, p-value)=', ztest(temperatures, value=98.6, alternative='two-sided'))
# t-based confidence interval for the mean, using tconfint_mean's default settings.
print('CI tconfint=', DescrStatsW(temperatures).tconfint_mean())

# Manual z-based interval: mean +/- z * (std / sqrt(n)).
sample_size = df.shape[0]
sample_mean = temperatures.mean()
# pandas .std() defaults to ddof=1, so this is the SAMPLE standard deviation
# standing in for the population value.
pop_stdev = temperatures.std()
z_critical = stats.norm.ppf(q=0.975)  # z-critical value at the 0.975 quantile (two-sided 95%)
standard_error = pop_stdev / math.sqrt(sample_size)
margin_of_error = z_critical * standard_error
confidence_interval = (
    sample_mean - margin_of_error,
    sample_mean + margin_of_error,
)
print("margin_of_error:", margin_of_error)
In [5]:
import numpy as np
def cohens_d(x, y):
    """Return Cohen's d: the absolute mean difference divided by the pooled SD.

    Parameters
    ----------
    x, y : sized objects exposing ``.mean()`` and ``.var(ddof=...)``
        For example pandas Series/DataFrames or 1-D NumPy arrays.

    Returns
    -------
    ``abs(mean(x) - mean(y)) / sqrt(pooled variance)``, where the pooled
    variance weights each group's variance by ``len(group) - 1``. The result
    has whatever shape ``.mean()``/``.var()`` produce for the inputs (for a
    pandas DataFrame, one value per column).
    """
    dof_x = len(x) - 1
    dof_y = len(y) - 1
    mean_gap = np.abs(x.mean() - y.mean())  # mean difference (numerator)
    # Request the sample variance explicitly: pandas defaults to ddof=1 but
    # NumPy defaults to ddof=0, which would not match the (n - 1) weights.
    pooled_var = (dof_x * x.var(ddof=1) + dof_y * y.var(ddof=1)) / (dof_x + dof_y)
    return mean_gap / np.sqrt(pooled_var)  # Cohen's d
def printCohen(x):
    """Print a qualitative label for the effect size ``x`` and return ``x``.

    Labels by lower bound: >= .80 "large effect", >= .50 "medium effect",
    >= .20 "small effect", anything else "no effect".
    """
    # (lower bound, label) pairs, checked from the largest bound downwards.
    bands = (
        (.80, "large effect"),
        (.50, "medium effect"),
        (.20, "small effect"),
    )
    label = "no effect"
    for lower_bound, band_label in bands:
        if x >= lower_bound:
            label = band_label
            break
    print(label)
    return x
# Compare male vs female on the numeric measurements only. Passing the whole
# frames would include the string `gender` column, which recent pandas versions
# refuse to average; older versions silently dropped it.
numeric_cols = ['temperature', 'heart_rate']
cd = cohens_d(dfMale[numeric_cols], dfFemale[numeric_cols])
print("cohens D: male vs female")
print("(temperature)")
# Index by column label rather than position so each value is matched to its
# heading regardless of column order.
print(printCohen(cd['temperature']))
print("(heart_beat)")
print(printCohen(cd['heart_rate']))
In [ ]: