In [1]:
import numpy as np
import scipy.stats as st
import pandas as pd
import sys

In [5]:
# Make the local sci_analysis checkout importable for the `import sci_analysis` cell.
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
SCI_ANALYSIS_REPO = '/Users/chrismorrow/repos/sci_analysis'
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if SCI_ANALYSIS_REPO not in sys.path:
    sys.path.append(SCI_ANALYSIS_REPO)

In [6]:
import sci_analysis as a

In [6]:
# Seed NumPy's global RNG so the draw below is repeatable, sample from
# scipy.stats.norm, and hand the sample to sci_analysis.
SEED = 987654321
SAMPLE_SIZE = 200
np.random.seed(SEED)
input_array = st.norm.rvs(size=SAMPLE_SIZE)
a.analyze(input_array)


 
Statistics
----------
 
Count     =  200
Mean      =  0.0280
Std Dev   =  1.0721
Std Error =  0.0758
Skewness  =  0.0749
Kurtosis  = -0.4303
Maximum   =  3.1400
75%       =  0.7772
50%       =  0.0371
25%       = -0.7605
Minimum   = -2.4718
IQR       =  1.5377
Range     =  5.6117
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9947
p value =  0.7092
 
H0: Data is normally distributed
 

In [7]:
# Re-seed and redraw so this cell is self-contained, then run analyze() with
# the cdf and fit flags switched on.
np.random.seed(987654321)
input_array = st.norm.rvs(size=200)
analyze_flags = {'cdf': True, 'fit': True}
a.analyze(input_array, **analyze_flags)


 
Statistics
----------
 
Count     =  200
Mean      =  0.0280
Std Dev   =  1.0721
Std Error =  0.0758
Skewness  =  0.0749
Kurtosis  = -0.4303
Maximum   =  3.1400
75%       =  0.7772
50%       =  0.0371
25%       = -0.7605
Minimum   = -2.4718
IQR       =  1.5377
Range     =  5.6117
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9947
p value =  0.7092
 
H0: Data is normally distributed
 

In [8]:
# Seeded sample of 500 values from scipy.stats.weibull_min (shape 1.7), passed
# to analyze() with distribution='weibull_min' plus the cdf and fit flags.
np.random.seed(987654321)
weibull_shape = 1.7
input_array = st.weibull_min.rvs(weibull_shape, size=500)
a.analyze(
    input_array,
    cdf=True,
    fit=True,
    distribution='weibull_min',
)


 
Statistics
----------
 
Count     =  500
Mean      =  0.8904
Std Dev   =  0.5693
Std Error =  0.0255
Skewness  =  1.1975
Kurtosis  =  2.0804
Maximum   =  3.4965
75%       =  1.1472
50%       =  0.8021
25%       =  0.4599
Minimum   =  0.0632
IQR       =  0.6873
Range     =  3.4333
 
 
Kolmogorov-Smirnov Test
-----------------------
 
D value =  0.0328
p value =  0.6557
 
H0: Data is matched to the weibull_min distribution
 

In [9]:
# Directory holding the CSV inputs. Kept WITHOUT a trailing slash because the
# uses visible in this notebook join with a leading "/" (source_path + "/...");
# a trailing slash here produced ".../Data//BodyFat.csv".
# NOTE(review): machine-specific absolute path — point this at your own data folder.
source_path = "/Users/chrismorrow/Dropbox/Data"
# Load the body-fat measurements into the frame used by the following cells.
df = pd.read_csv(source_path + "/BodyFat.csv")

In [10]:
# Pairwise correlation matrix of the loaded columns; 'pearson' is the method
# pandas uses by default, spelled out here for the reader.
df.corr(method='pearson')


Out[10]:
IDNO BODYFAT DENSITY AGE WEIGHT HEIGHT ADIPOSITY NECK CHEST ABDOMEN HIP THIGH KNEE ANKLE BICEPS FOREARM WRIST Unnamed: 17
IDNO 1.000000 0.110951 -0.109605 0.341254 0.033728 0.040943 0.047717 0.071112 0.120515 0.121720 -0.023737 -0.080708 0.047939 -0.070644 -0.015677 0.001960 0.081845 NaN
BODYFAT 0.110951 1.000000 -0.988087 0.289174 0.613156 -0.089106 0.727994 0.491489 0.702885 0.813706 0.625700 0.561284 0.507786 0.266783 0.493031 0.363277 0.347573 NaN
DENSITY -0.109605 -0.988087 1.000000 -0.277637 -0.594062 0.097881 -0.714732 -0.472966 -0.682599 -0.798955 -0.609331 -0.553091 -0.495040 -0.264890 -0.487109 -0.351648 -0.325716 NaN
AGE 0.341254 0.289174 -0.277637 1.000000 -0.012746 -0.171645 0.118851 0.113505 0.176450 0.230409 -0.050332 -0.200096 0.017516 -0.105058 -0.041162 -0.085056 0.213531 NaN
WEIGHT 0.033728 0.613156 -0.594062 -0.012746 1.000000 0.308279 0.887352 0.830716 0.894191 0.887995 0.940884 0.868694 0.853167 0.613685 0.800416 0.630301 0.729775 NaN
HEIGHT 0.040943 -0.089106 0.097881 -0.171645 0.308279 1.000000 -0.024891 0.253710 0.134892 0.087813 0.170394 0.148436 0.286053 0.264744 0.207816 0.228649 0.322065 NaN
ADIPOSITY 0.047717 0.727994 -0.714732 0.118851 0.887352 -0.024891 1.000000 0.777857 0.911799 0.923880 0.883269 0.812706 0.713660 0.500317 0.746384 0.558594 0.625907 NaN
NECK 0.071112 0.491489 -0.472966 0.113505 0.830716 0.253710 0.777857 1.000000 0.784835 0.754077 0.734958 0.695697 0.672405 0.477892 0.731146 0.623660 0.744826 NaN
CHEST 0.120515 0.702885 -0.682599 0.176450 0.894191 0.134892 0.911799 0.784835 1.000000 0.915828 0.829420 0.729859 0.719496 0.482988 0.727907 0.580173 0.660162 NaN
ABDOMEN 0.121720 0.813706 -0.798955 0.230409 0.887995 0.087813 0.923880 0.754077 0.915828 1.000000 0.874066 0.766624 0.737179 0.453223 0.684983 0.503316 0.619832 NaN
HIP -0.023737 0.625700 -0.609331 -0.050332 0.940884 0.170394 0.883269 0.734958 0.829420 0.874066 1.000000 0.896410 0.823473 0.558387 0.739273 0.545014 0.630090 NaN
THIGH -0.080708 0.561284 -0.553091 -0.200096 0.868694 0.148436 0.812706 0.695697 0.729859 0.766624 0.896410 1.000000 0.799170 0.539797 0.761477 0.566842 0.558685 NaN
KNEE 0.047939 0.507786 -0.495040 0.017516 0.853167 0.286053 0.713660 0.672405 0.719496 0.737179 0.823473 0.799170 1.000000 0.611608 0.678709 0.555898 0.664507 NaN
ANKLE -0.070644 0.266783 -0.264890 -0.105058 0.613685 0.264744 0.500317 0.477892 0.482988 0.453223 0.558387 0.539797 0.611608 1.000000 0.484855 0.419050 0.566195 NaN
BICEPS -0.015677 0.493031 -0.487109 -0.041162 0.800416 0.207816 0.746384 0.731146 0.727907 0.684983 0.739273 0.761477 0.678709 0.484855 1.000000 0.678255 0.632126 NaN
FOREARM 0.001960 0.363277 -0.351648 -0.085056 0.630301 0.228649 0.558594 0.623660 0.580173 0.503316 0.545014 0.566842 0.555898 0.419050 0.678255 1.000000 0.585588 NaN
WRIST 0.081845 0.347573 -0.325716 0.213531 0.729775 0.322065 0.625907 0.744826 0.660162 0.619832 0.630090 0.558685 0.664507 0.566195 0.632126 0.585588 1.000000 NaN
Unnamed: 17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [11]:
# Derived columns used by the grouped analyses further down.
# BMI = weight / height^2 * 703.
# NOTE(review): the 703 factor assumes WEIGHT in pounds and HEIGHT in inches — confirm.
df['BMI'] = (df['WEIGHT'] / df['HEIGHT'] ** 2) * 703
# Age bands from the listed bin edges (pd.cut bins are right-closed by default).
df['Age_Range'] = pd.cut(df['AGE'].values, [20, 26, 33, 39, 45, 51, 57, 63, 69, 75, 81])
# BMI classes. Label spelling corrected: 'Very Severly Underweight' ->
# 'Very Severely Underweight', consistent with the other 'Severely' labels.
df['Class'] = pd.cut(df['BMI'].values, [0, 15, 16, 18.5, 25, 30, 35, 40, 200],
                     labels=['Very Severely Underweight', 'Severely Underweight', 'Underweight', 'Normal',
                             'Overweight', 'Moderately Obese', 'Severely Obese', 'Very Severely Obese'])

In [12]:
# Bivariate analysis: ABDOMEN as the x series, BODYFAT as the y series.
a.analyze(
    df['ABDOMEN'],
    df['BODYFAT'],
    xname='Abdomen',
    yname='Bodyfat',
)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
r value =  0.8155
p value =  0.0000
 
HA: There is a significant relationship between predictor and response
 

In [13]:
# Same ABDOMEN-vs-BODYFAT analysis, with boxplot_borders switched on.
axis_labels = {'xname': 'Abdomen', 'yname': 'Bodyfat'}
a.analyze(df['ABDOMEN'], df['BODYFAT'], boxplot_borders=True, **axis_labels)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
r value =  0.8155
p value =  0.0000
 
HA: There is a significant relationship between predictor and response
 

In [14]:
# ABDOMEN-vs-BODYFAT again: boxplot borders and contours on, fit turned off.
a.analyze(
    df['ABDOMEN'],
    df['BODYFAT'],
    xname='Abdomen',
    yname='Bodyfat',
    boxplot_borders=True,
    fit=False,
    contours=True,
)


 
Linear Regression
-----------------
 
Count     =  252
Slope     =  0.5849
Intercept = -35.1966
R^2       =  0.8137
Std Err   =  0.0264
p value   =  0.0000
 
HA: There is a significant relationship between predictor and response
 
 
Spearman Correlation Coefficient
--------------------------------
 
r value =  0.8155
p value =  0.0000
 
HA: There is a significant relationship between predictor and response
 

In [15]:
# Single-variable analysis of BODYFAT with fit=True.
# NOTE(review): the recorded output reports Minimum = 0.0000 for this column —
# worth checking for bad records.
a.analyze(
    df['BODYFAT'],
    name='Bodyfat',
    fit=True,
)


 
Statistics
----------
 
Count     =  252
Mean      =  18.9385
Std Dev   =  7.7509
Std Error =  0.4883
Skewness  =  0.1434
Kurtosis  = -0.3245
Maximum   =  45.1000
75%       =  24.6000
50%       =  19.0000
25%       =  12.8000
Minimum   =  0.0000
IQR       =  11.8000
Range     =  45.1000
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9929
p value =  0.2747
 
H0: Data is normally distributed
 

In [16]:
# Single-variable analysis of the AGE column, labelled 'Age'.
a.analyze(
    df['AGE'],
    name='Age',
)


 
Statistics
----------
 
Count     =  252
Mean      =  44.8849
Std Dev   =  12.6020
Std Error =  0.7939
Skewness  =  0.2818
Kurtosis  = -0.4319
Maximum   =  81.0000
75%       =  54.0000
50%       =  43.0000
25%       =  35.7500
Minimum   =  22.0000
IQR       =  18.2500
Range     =  59.0000
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9795
p value =  0.0010
 
HA: Data is not normally distributed
 

In [17]:
# Mean BODYFAT per age band. Select the column BEFORE aggregating: averaging the
# whole frame first computes means nobody reads, and recent pandas raises when
# .mean() meets non-numeric columns such as the categorical 'Class'.
df.groupby('Age_Range')['BODYFAT'].mean()


Out[17]:
Age_Range
(20, 26]    15.105000
(26, 33]    15.106897
(33, 39]    17.815385
(39, 45]    20.053968
(45, 51]    19.044186
(51, 57]    17.828125
(57, 63]    22.242857
(63, 69]    24.275000
(69, 75]    24.425000
(75, 81]    21.100000
Name: BODYFAT, dtype: float64

In [18]:
# Height samples per age band, excluding the oldest band (75, 81].
# Rows with HEIGHT <= 60 are dropped first (presumably implausible records — confirm).
df2 = df[df['HEIGHT'] > 60]
# Exclude the (75, 81] band by filtering on AGE itself. Comparing 'Age_Range'
# with the string '(75, 81]' only works when pd.cut produces string labels; with
# Interval categories that comparison matches nothing and the band is kept.
age_75_or_less = df2[df2['AGE'] <= 75]
# Skip empty groups: a categorical grouper can yield categories with no rows.
height = {name: group['HEIGHT'].values
          for name, group in age_75_or_less.groupby('Age_Range')
          if len(group) > 0}
a.analyze(height)


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
26             70.2308       2.4412        65.5000       70.6250       74.5000      (33, 39]      
14             68.8393       2.4369        65.7500       67.7500       73.2500      (57, 63]      
62             70.6452       2.3195        65.5000       70.1250       76.0000      (39, 45]      
20             71.9000       2.6462        66.2500       72.2500       77.5000      (20, 26]      
43             70.4535       2.8000        64.0000       70.7500       75.0000      (45, 51]      
16             68.8125       2.2922        65.7500       68.5000       72.7500      (63, 69]      
32             70.1328       2.8575        64.0000       69.7500       77.7500      (51, 57]      
29             70.5345       2.6201        64.7500       71.0000       76.0000      (26, 33]      
8              68.7188       1.7031        66.0000       69.5000       70.5000      (69, 75]      
 
 
Bartlett Test
-------------
 
p value =  0.7671
T value =  4.9101
 
H0: Variances are equal
 
 
Oneway ANOVA
------------
 
p value =  0.0049
f value =  2.8454
 
HA: Group means are not matched
 

In [19]:
# Same height-by-age-band comparison, passed to analyze() as parallel lists so
# group labels, category label, variable name and title can be supplied.
# Rows with HEIGHT <= 60 are dropped first (presumably implausible records — confirm).
df2 = df[df['HEIGHT'] > 60]
groups = list()
height = list()
# Exclude the (75, 81] band by filtering on AGE itself: comparing 'Age_Range'
# with the string '(75, 81]' matches nothing when pd.cut yields Interval
# categories, which would silently keep the band.
for name, group in df2[df2['AGE'] <= 75].groupby('Age_Range'):
    # A categorical grouper can yield categories with no rows; skip those.
    if len(group) == 0:
        continue
    groups.append(name)
    height.append(group['HEIGHT'].values)
a.analyze(height, groups=groups, categories='Age Group', name='Height', title='Height by Age Group')


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
26             70.2308       2.4412        65.5000       70.6250       74.5000      (33, 39]      
14             68.8393       2.4369        65.7500       67.7500       73.2500      (57, 63]      
43             70.4535       2.8000        64.0000       70.7500       75.0000      (45, 51]      
20             71.9000       2.6462        66.2500       72.2500       77.5000      (20, 26]      
62             70.6452       2.3195        65.5000       70.1250       76.0000      (39, 45]      
16             68.8125       2.2922        65.7500       68.5000       72.7500      (63, 69]      
32             70.1328       2.8575        64.0000       69.7500       77.7500      (51, 57]      
29             70.5345       2.6201        64.7500       71.0000       76.0000      (26, 33]      
8              68.7188       1.7031        66.0000       69.5000       70.5000      (69, 75]      
 
 
Bartlett Test
-------------
 
p value =  0.7671
T value =  4.9101
 
H0: Variances are equal
 
 
Oneway ANOVA
------------
 
p value =  0.0049
f value =  2.8454
 
HA: Group means are not matched
 

In [20]:
# Compare BODYFAT between the 'Normal' and 'Overweight' BMI classes.
class_names = ['Normal', 'Overweight']
bodyfat_by_class = df['BODYFAT'].groupby(df['Class'])
a.analyze([bodyfat_by_class.get_group(label) for label in class_names], groups=class_names)


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
102            22.1098       5.3771        8.5000        21.8000       36.5000      Overweight    
124            14.3089       6.0051        1.9000        13.5500       28.4000      Normal        
 
 
Mann Whitney U Test
-------------------
 
u value =  2197.5000
p value =  0.0000
 
HA: Locations are not matched
 

In [21]:
# Compare HEIGHT between the 'Normal' and 'Overweight' BMI classes.
height_by_class = df['HEIGHT'].groupby(df['Class'])
normal_height = height_by_class.get_group('Normal')
overweight_height = height_by_class.get_group('Overweight')
a.analyze([normal_height, overweight_height], groups=['Normal', 'Overweight'])


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
102            70.6201       2.6608        65.5000       70.0000       77.7500      Overweight    
124            70.1593       2.5428        64.0000       70.2500       77.5000      Normal        
 
 
T Test
------
 
t value = -1.3276
p value =  0.1857
 
H0: Means are matched
 

In [22]:
# Load the player statistics CSV from the ncaa_2016 folder under source_path.
player_stats_csv = source_path + "/ncaa_2016/player_stats.csv"
bb = pd.read_csv(player_stats_csv)

In [23]:
# Show the column names of the player-stats frame to see which fields are available.
bb.columns


Out[23]:
Index(['Team', 'Division', 'Player', 'Year', 'Season', 'Position', 'Height',
       'Games', 'fg_m', 'fg_a', 'fg_p', 'fgmpg', 'pt3_m', 'pt3_a', 'pt3_p',
       'pt3mpg', 'ft_m', 'ft_a', 'ft_p', 'ftmpg', 'rb_num', 'rb_avg', 'rbmpg',
       'ast_num', 'ast_avg', 'astpg', 'blk_num', 'blk_avg', 'blkpg', 'st_num',
       'st_avg', 'stlpg', 'points_num', 'points_avg', 'topg', 'double_doubles',
       'triple_doubles'],
      dtype='object')

In [26]:
# Bivariate analysis of the 'fgmpg' column (x) against the 'astpg' column (y).
fgmpg, astpg = bb['fgmpg'], bb['astpg']
a.analyze(fgmpg, astpg)


 
Linear Regression
-----------------
 
Count     =  10836
Slope     =  0.4207
Intercept =  0.1884
R^2       =  0.6679
Std Err   =  0.0045
p value   =  0.0000
 
Spearman Correlation Coefficient
--------------------------------
 
r value =  0.8176
p value =  0.0000
/Users/chrismorrow/miniconda3/lib/python3.5/site-packages/scipy/stats/morestats.py:1330: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

In [27]:
# Single-variable analysis of the 'points_avg' column.
# NOTE(review): the recorded output reports Minimum = -0.5000 for this column —
# worth checking for bad records.
points_avg = bb['points_avg']
a.analyze(points_avg)


/Users/chrismorrow/miniconda3/lib/python3.5/site-packages/scipy/stats/morestats.py:1330: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")
 
Statistics
----------
 
Count     =  9236
Mean      =  6.1154
Std Dev   =  5.0549
Std Error =  0.0526
Skewness  =  0.9131
Kurtosis  =  0.1606
Maximum   =  29.6790
75%       =  9.5710
50%       =  4.7500
25%       =  1.8180
Minimum   = -0.5000
IQR       =  7.7530
Range     =  30.1790
 
 
Shapiro-Wilk test for normality
-------------------------------
 
W value =  0.9073
p value =  0.0000
 
HA: Data is not normally distributed
 

In [33]:
# Compare 'fgmpg' across the values of the 'Year' column: build a
# {year: fgmpg series} mapping and pass it to analyze().
fgmpg_by_year = {}
for year, rows in bb.groupby('Year'):
    fgmpg_by_year[year] = rows['fgmpg']
a.analyze(fgmpg_by_year)


Group Statistics
 
Count         Mean          Std Dev       Min           Median        Max           Group         
--------------------------------------------------------------------------------------------------
2317           2.7631        2.0420        0.0000        2.6130        9.6670       Sr.           
2963           1.0605        1.2963        0.0000        0.5830        7.6540       Fr.           
2496           1.7069        1.6590        0.0000        1.1970        8.0950       So.           
2688           2.1218        1.8286        0.0000        1.7390        9.2170       Jr.           
372            1.0255        1.3321        0.0000        0.5180        8.2220       ---           
 
 
Levene Test
-----------
 
W value =  216.2903
p value =  0.0000
 
HA: Variances are not equal
 
 
Kruskal-Wallis
--------------
 
p value =  0.0000
h value =  1272.7549
 
HA: Group means are not matched
 

In [ ]: