In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

"""
Type 1 - Count & Probability for Discrete variable
Type 2 - Equal Bin Size, Count & Probability for Continuous variable
Type 3 - Normalized Count for Discrete variable
Type 4 - Equal Bin Size, Normalized Count for Continuous variable
"""

In [2]:
preg_raw = pd.read_csv("2002FemPregOut.csv")
reps_raw = pd.read_csv("2002FemRespOut.csv")

In [4]:
preg_raw = preg_raw.dropna()
preg_raw.head()


Out[4]:
caseid nbrnaliv babysex birthwgt_lb birthwgt_oz prglngth outcome birthord agepreg finalwgt
0 1 1.0 1.0 8.0 13.0 39 1 1.0 3316.0 6448.271112
1 1 1.0 2.0 7.0 14.0 39 1 2.0 3925.0 6448.271112
2 2 3.0 1.0 9.0 2.0 39 1 1.0 1433.0 12999.542264
3 2 1.0 2.0 7.0 0.0 39 1 2.0 1783.0 12999.542264
4 2 1.0 2.0 6.0 3.0 39 1 3.0 1833.0 12999.542264

In [8]:
birthorder_ct = preg_raw[['birthord', 'caseid']]\
                      .groupby(['birthord'])['caseid']\
                      .agg(['count'])\
                      .sort_values(['count'], ascending=False)
print birthorder_ct
print birthorder_ct.sum()


          count
birthord       
1.0        4381
2.0        2861
3.0        1225
4.0         417
5.0         125
6.0          49
7.0          19
8.0           7
9.0           2
10.0          1
count    9087
dtype: int64

In [10]:
# Type 1 - Count & Probability for Discrete variable
birthorder_ct['prob'] = birthorder_ct/birthorder_ct.sum()
birthorder_ct


Out[10]:
count prob
birthord
1.0 4381 0.482117
2.0 2861 0.314845
3.0 1225 0.134808
4.0 417 0.045890
5.0 125 0.013756
6.0 49 0.005392
7.0 19 0.002091
8.0 7 0.000770
9.0 2 0.000220
10.0 1 0.000110

In [22]:
birthorder_ct.plot.bar(x=birthorder_ct.index, y='prob')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ee94890>

In [26]:
# Type 3 - Normalized Count for Discrete variable
## normalize to [0,1] range
birthorder_ct['normalized_ct'] = (birthorder_ct['count']-birthorder_ct['count'].min())/(birthorder_ct['count'].max()-birthorder_ct['count'].min())
birthorder_ct


Out[26]:
count prob normalized_ct
birthord
1.0 4381 0.482117 1.000000
2.0 2861 0.314845 0.652968
3.0 1225 0.134808 0.279452
4.0 417 0.045890 0.094977
5.0 125 0.013756 0.028311
6.0 49 0.005392 0.010959
7.0 19 0.002091 0.004110
8.0 7 0.000770 0.001370
9.0 2 0.000220 0.000228
10.0 1 0.000110 0.000000

In [27]:
birthorder_ct.plot.bar(x=birthorder_ct.index, y='normalized_ct')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f182c50>

In [30]:
# Type 2 - Equal Bin Size, Count & Probability for Continuous variable
## by default, 10 bins
hist_vals, bin_edges = np.histogram(preg_raw['prglngth'])
print hist_vals
print bin_edges


[   2    1    0    4   24   92  387 6336 2220   21]
[  0.   5.  10.  15.  20.  25.  30.  35.  40.  45.  50.]

In [32]:
plt.bar(bin_edges[:-1], hist_vals, width = 1)
plt.xlim(min(bin_edges), max(bin_edges))
plt.show()



In [40]:
# Type 4 - Equal Bin Size, Normalized Count for Continuous variable
min_hist_val = min(hist_vals)
max_hist_val = max(hist_vals)
normalized_hist_vals = [(elem-min_hist_val)*1.0/(max_hist_val-min_hist_val) for elem in hist_vals]
normalized_hist_vals


Out[40]:
[0.00031565656565656568,
 0.00015782828282828284,
 0.0,
 0.00063131313131313137,
 0.003787878787878788,
 0.01452020202020202,
 0.061079545454545456,
 1.0,
 0.3503787878787879,
 0.0033143939393939395]

In [41]:
plt.bar(bin_edges[:-1], normalized_hist_vals, width = 1)
plt.xlim(min(bin_edges), max(bin_edges))
plt.show()