In [1]:
import os
import sys

import numpy as np
import pandas as pd
import scipy.stats as st

In [6]:
# Make the local sci_analysis checkout importable for the `import sci_analysis` that follows.
# NOTE(review): machine-specific absolute path; adjust for other environments.
SCI_ANALYSIS_PATH = '/Users/chrismorrow/repos/sci_analysis'
# Guard so that re-running this cell does not pile up duplicate sys.path entries.
if SCI_ANALYSIS_PATH not in sys.path:
    sys.path.append(SCI_ANALYSIS_PATH)

In [7]:
import sci_analysis as a

In [8]:
# Seed NumPy's global RNG so the sample drawn below is reproducible.
np.random.seed(987654321)
# 200 draws from the standard normal; loc/scale are spelled out and equal the defaults.
input_array = st.norm.rvs(loc=0, scale=1, size=200)
# Single-array call: the output reports summary statistics and a Shapiro-Wilk test.
a.analyze(input_array)


 
Statistics
----------
 
Count     =  200
Mean      =  0.0280
Std Dev   =  1.0721
Std Error =  0.0758
Skewness  =  0.0749
Kurtosis  = -0.4303
Maximum   =  3.1400
75%       =  0.7772
50%       =  0.0371
25%       = -0.7605
Minimum   = -2.4718
IQR       =  1.5377
Range     =  5.6117
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9947
p value =  0.7092
 
H0: Data is normally distributed
 

In [9]:
# Re-seed and redraw so this cell is self-contained and reproducible.
np.random.seed(987654321)
# 200 draws from the standard normal; loc/scale are spelled out and equal the defaults.
input_array = st.norm.rvs(loc=0, scale=1, size=200)
# cdf/fit are presumably plotting options (TODO confirm); the printed statistics
# are the same as for a plain a.analyze(input_array) call on this sample.
a.analyze(
    input_array,
    cdf=True,
    fit=True,
)


 
Statistics
----------
 
Count     =  200
Mean      =  0.0280
Std Dev   =  1.0721
Std Error =  0.0758
Skewness  =  0.0749
Kurtosis  = -0.4303
Maximum   =  3.1400
75%       =  0.7772
50%       =  0.0371
25%       = -0.7605
Minimum   = -2.4718
IQR       =  1.5377
Range     =  5.6117
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9947
p value =  0.7092
 
H0: Data is normally distributed
 

In [10]:
# Seed NumPy's global RNG so the Weibull sample below is reproducible.
np.random.seed(987654321)
# 500 draws from weibull_min with shape 1.7; loc/scale are spelled out and equal the defaults.
input_array = st.weibull_min.rvs(1.7, loc=0, scale=1, size=500)
# Naming the distribution makes the output report a Kolmogorov-Smirnov test
# against weibull_min rather than a Shapiro-Wilk normality test.
a.analyze(
    input_array,
    cdf=True,
    fit=True,
    distribution='weibull_min',
)


 
Statistics
----------
 
Count     =  500
Mean      =  0.8904
Std Dev   =  0.5693
Std Error =  0.0255
Skewness  =  1.1975
Kurtosis  =  2.0804
Maximum   =  3.4965
75%       =  1.1472
50%       =  0.8021
25%       =  0.4599
Minimum   =  0.0632
IQR       =  0.6873
Range     =  3.4333
 
 
Kolmogorov-Smirnov Test
-----------------------
 
D value =  0.0328
p value =  0.6558
 
H0: Data is matched to the weibull_min distribution
 

In [11]:
# Directory holding the input data set.
# NOTE(review): machine-specific absolute path; point this at your own copy of the data.
source_path = "/Users/chrismorrow/Dropbox/Data/"
# os.path.join yields exactly one separator whether or not source_path ends with "/".
df = pd.read_csv(os.path.join(source_path, "BodyFat.csv"))

In [12]:
# Pairwise correlation matrix of the loaded columns (Pearson is the pandas default,
# spelled out here for the reader).
df.corr(method='pearson')


Out[12]:
IDNO BODYFAT DENSITY AGE WEIGHT HEIGHT ADIPOSITY NECK CHEST ABDOMEN HIP THIGH KNEE ANKLE BICEPS FOREARM WRIST Unnamed: 17
IDNO 1.000000 0.110951 -0.109605 0.341254 0.033728 0.040943 0.047717 0.071112 0.120515 0.121720 -0.023737 -0.080708 0.047939 -0.070644 -0.015677 0.001960 0.081845 NaN
BODYFAT 0.110951 1.000000 -0.988087 0.289174 0.613156 -0.089106 0.727994 0.491489 0.702885 0.813706 0.625700 0.561284 0.507786 0.266783 0.493031 0.363277 0.347573 NaN
DENSITY -0.109605 -0.988087 1.000000 -0.277637 -0.594062 0.097881 -0.714732 -0.472966 -0.682599 -0.798955 -0.609331 -0.553091 -0.495040 -0.264890 -0.487109 -0.351648 -0.325716 NaN
AGE 0.341254 0.289174 -0.277637 1.000000 -0.012746 -0.171645 0.118851 0.113505 0.176450 0.230409 -0.050332 -0.200096 0.017516 -0.105058 -0.041162 -0.085056 0.213531 NaN
WEIGHT 0.033728 0.613156 -0.594062 -0.012746 1.000000 0.308279 0.887352 0.830716 0.894191 0.887995 0.940884 0.868694 0.853167 0.613685 0.800416 0.630301 0.729775 NaN
HEIGHT 0.040943 -0.089106 0.097881 -0.171645 0.308279 1.000000 -0.024891 0.253710 0.134892 0.087813 0.170394 0.148436 0.286053 0.264744 0.207816 0.228649 0.322065 NaN
ADIPOSITY 0.047717 0.727994 -0.714732 0.118851 0.887352 -0.024891 1.000000 0.777857 0.911799 0.923880 0.883269 0.812706 0.713660 0.500317 0.746384 0.558594 0.625907 NaN
NECK 0.071112 0.491489 -0.472966 0.113505 0.830716 0.253710 0.777857 1.000000 0.784835 0.754077 0.734958 0.695697 0.672405 0.477892 0.731146 0.623660 0.744826 NaN
CHEST 0.120515 0.702885 -0.682599 0.176450 0.894191 0.134892 0.911799 0.784835 1.000000 0.915828 0.829420 0.729859 0.719496 0.482988 0.727907 0.580173 0.660162 NaN
ABDOMEN 0.121720 0.813706 -0.798955 0.230409 0.887995 0.087813 0.923880 0.754077 0.915828 1.000000 0.874066 0.766624 0.737179 0.453223 0.684983 0.503316 0.619832 NaN
HIP -0.023737 0.625700 -0.609331 -0.050332 0.940884 0.170394 0.883269 0.734958 0.829420 0.874066 1.000000 0.896410 0.823473 0.558387 0.739273 0.545014 0.630090 NaN
THIGH -0.080708 0.561284 -0.553091 -0.200096 0.868694 0.148436 0.812706 0.695697 0.729859 0.766624 0.896410 1.000000 0.799170 0.539797 0.761477 0.566842 0.558685 NaN
KNEE 0.047939 0.507786 -0.495040 0.017516 0.853167 0.286053 0.713660 0.672405 0.719496 0.737179 0.823473 0.799170 1.000000 0.611608 0.678709 0.555898 0.664507 NaN
ANKLE -0.070644 0.266783 -0.264890 -0.105058 0.613685 0.264744 0.500317 0.477892 0.482988 0.453223 0.558387 0.539797 0.611608 1.000000 0.484855 0.419050 0.566195 NaN
BICEPS -0.015677 0.493031 -0.487109 -0.041162 0.800416 0.207816 0.746384 0.731146 0.727907 0.684983 0.739273 0.761477 0.678709 0.484855 1.000000 0.678255 0.632126 NaN
FOREARM 0.001960 0.363277 -0.351648 -0.085056 0.630301 0.228649 0.558594 0.623660 0.580173 0.503316 0.545014 0.566842 0.555898 0.419050 0.678255 1.000000 0.585588 NaN
WRIST 0.081845 0.347573 -0.325716 0.213531 0.729775 0.322065 0.625907 0.744826 0.660162 0.619832 0.630090 0.558685 0.664507 0.566195 0.632126 0.585588 1.000000 NaN
Unnamed: 17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [13]:
# Body-mass index via the imperial formula: 703 * weight / height ** 2.
# NOTE(review): assumes WEIGHT is in pounds and HEIGHT in inches — confirm against the data set.
df['BMI'] = (df['WEIGHT'] / df['HEIGHT'] ** 2) * 703
# Bucket ages into right-closed bins of roughly six years, from (20, 26] up to (75, 81].
df['Age_Range'] = pd.cut(df['AGE'].values, [20, 26, 33, 39, 45, 51, 57, 63, 69, 75, 81])
# Map BMI onto eight named weight classes (nine bin edges -> eight labels).
# The first label's spelling is corrected to "Severely" to match the other labels.
df['Class'] = pd.cut(df['BMI'].values, [0, 15, 16, 18.5, 25, 30, 35, 40, 200],
                     labels=['Very Severely Underweight', 'Severely Underweight', 'Underweight', 'Normal',
                             'Overweight', 'Moderately Obese', 'Severely Obese', 'Very Severely Obese'])

In [17]:
# Two-series call: abdomen measurement as x, body fat as y.
# The output reports a linear regression and a Spearman correlation.
a.analyze(
    df['ABDOMEN'],
    df['BODYFAT'],
    xname='Abdomen',
    yname='Bodyfat',
)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
p value =  0.0000
r value =  0.8155
 
HA: There is a significant relationship between predictor and response
 

In [26]:
# Same abdomen-vs-bodyfat analysis with boxplot_borders enabled.
# NOTE(review): boxplot_borders is presumably a plotting option — confirm; the printed
# regression and correlation figures are unchanged by it.
a.analyze(
    df['ABDOMEN'],
    df['BODYFAT'],
    xname='Abdomen',
    yname='Bodyfat',
    boxplot_borders=True,
)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
p value =  0.0000
r value =  0.8155
 
HA: There is a significant relationship between predictor and response
 

In [29]:
# Abdomen-vs-bodyfat analysis with boxplot_borders and contours on and fit off.
# NOTE(review): these look like plotting options — confirm; the printed regression
# and correlation figures are unchanged by them.
a.analyze(
    df['ABDOMEN'],
    df['BODYFAT'],
    xname='Abdomen',
    yname='Bodyfat',
    boxplot_borders=True,
    fit=False,
    contours=True,
)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
p value =  0.0000
r value =  0.8155
 
HA: There is a significant relationship between predictor and response
 

In [20]:
# Single-series call on body fat: the output reports summary statistics and a
# Shapiro-Wilk normality test.
a.analyze(
    df['BODYFAT'],
    name='Bodyfat',
    fit=True,
)


 
Statistics
----------
 
Count     =  252
Mean      =  18.9385
Std Dev   =  7.7509
Std Error =  0.4883
Skewness  =  0.1434
Kurtosis  = -0.3245
Maximum   =  45.1000
75%       =  24.6000
50%       =  19.0000
25%       =  12.8000
Minimum   =  0.0000
IQR       =  11.8000
Range     =  45.1000
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9929
p value =  0.2747
 
H0: Data is normally distributed
 

In [21]:
# Single-series call on age: the output reports summary statistics and a
# Shapiro-Wilk normality test.
a.analyze(
    df['AGE'],
    name='Age',
)


 
Statistics
----------
 
Count     =  252
Mean      =  44.8849
Std Dev   =  12.6020
Std Error =  0.7939
Skewness  =  0.2818
Kurtosis  = -0.4319
Maximum   =  81.0000
75%       =  54.0000
50%       =  43.0000
25%       =  35.7500
Minimum   =  22.0000
IQR       =  18.2500
Range     =  59.0000
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9795
p value =  0.0010
 
HA: Data is not normally distributed
 

In [31]:
# Mean body fat per age bin. Selecting BODYFAT before aggregating avoids averaging
# every other column, including the non-numeric Age_Range/Class columns, which newer
# pandas versions refuse to average.
df.groupby('Age_Range')['BODYFAT'].mean()


Out[31]:
Age_Range
(20, 26]    15.105000
(26, 33]    15.106897
(33, 39]    17.815385
(39, 45]    20.053968
(45, 51]    19.044186
(51, 57]    17.828125
(57, 63]    22.242857
(63, 69]    24.275000
(69, 75]    24.425000
(75, 81]    21.100000
Name: BODYFAT, dtype: float64

In [18]:
# Keep only rows whose HEIGHT exceeds 60.
df2 = df[df['HEIGHT'] > 60]
# Exclude the oldest bin, (75, 81], by filtering on AGE directly (75 is that bin's open
# lower edge). Comparing Age_Range to the string '(75, 81]' only works when pd.cut emits
# string labels; with Interval categories the string never matches and the filter
# silently keeps every row.
height = {name: group['HEIGHT'].values for name, group in df2[df2['AGE'] <= 75].groupby('Age_Range')}
# Dict-of-arrays call: the output reports per-group statistics, a Bartlett test and a one-way ANOVA.
a.analyze(height)


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
43             70.4535       2.8000        64.0000       70.7500       75.0000      (45, 51]      
14             68.8393       2.4369        65.7500       67.7500       73.2500      (57, 63]      
16             68.8125       2.2922        65.7500       68.5000       72.7500      (63, 69]      
8              68.7188       1.7031        66.0000       69.5000       70.5000      (69, 75]      
29             70.5345       2.6201        64.7500       71.0000       76.0000      (26, 33]      
62             70.6452       2.3195        65.5000       70.1250       76.0000      (39, 45]      
32             70.1328       2.8575        64.0000       69.7500       77.7500      (51, 57]      
26             70.2308       2.4412        65.5000       70.6250       74.5000      (33, 39]      
20             71.9000       2.6462        66.2500       72.2500       77.5000      (20, 26]      
 
 
Bartlett Test
-------------
 
T value =  4.9101
p value =  0.7671
 
H0: Variances are equal
 
 
Oneway ANOVA
------------
 
f value =  2.8454
p value =  0.0049
 
HA: Group means are not matched
 

In [25]:
# Keep only rows whose HEIGHT exceeds 60.
df2 = df[df['HEIGHT'] > 60]
# Parallel lists: groups[i] is the age-bin label for the height array in height[i].
groups = []
height = []
# Exclude the oldest bin, (75, 81], by filtering on AGE directly (75 is that bin's open
# lower edge). Comparing Age_Range to the string '(75, 81]' only works when pd.cut emits
# string labels; with Interval categories the string never matches and the filter
# silently keeps every row.
for name, group in df2[df2['AGE'] <= 75].groupby('Age_Range'):
    groups.append(name)
    height.append(group['HEIGHT'].values)
# List-of-arrays call with explicit group labels and display names; the output reports
# per-group statistics, a Bartlett test and a one-way ANOVA.
a.analyze(height, groups=groups, categories='Age Group', name='Height', title='Height by Age Group')


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
43             70.4535       2.8000        64.0000       70.7500       75.0000      (45, 51]      
14             68.8393       2.4369        65.7500       67.7500       73.2500      (57, 63]      
16             68.8125       2.2922        65.7500       68.5000       72.7500      (63, 69]      
8              68.7188       1.7031        66.0000       69.5000       70.5000      (69, 75]      
29             70.5345       2.6201        64.7500       71.0000       76.0000      (26, 33]      
62             70.6452       2.3195        65.5000       70.1250       76.0000      (39, 45]      
32             70.1328       2.8575        64.0000       69.7500       77.7500      (51, 57]      
26             70.2308       2.4412        65.5000       70.6250       74.5000      (33, 39]      
20             71.9000       2.6462        66.2500       72.2500       77.5000      (20, 26]      
 
 
Bartlett Test
-------------
 
T value =  4.9101
p value =  0.7671
 
H0: Variances are equal
 
 
Oneway ANOVA
------------
 
f value =  2.8454
p value =  0.0049
 
HA: Group means are not matched
 

In [ ]: