In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
"""
Type 1 - Count & Probability for Discrete variable
Type 2 - Equal Bin Size, Count & Probability for Continuous variable
Type 3 - Normalized Count for Discrete variable
Type 4 - Equal Bin Size, Normalized Count for Continuous variable
"""
In [2]:
# Load the two CSV inputs used by this notebook. Both paths are relative, so
# the files are looked up in the kernel's current working directory.
# NOTE(review): `reps_raw` looks like a transposition of "resp" (its file is
# "...FemRespOut.csv"); the name is kept unchanged in case other cells use it.
preg_csv_path = "2002FemPregOut.csv"
resp_csv_path = "2002FemRespOut.csv"
preg_raw = pd.read_csv(preg_csv_path)
reps_raw = pd.read_csv(resp_csv_path)
In [4]:
# Keep only the rows that have no missing value in any column (the arguments
# below are dropna's defaults, spelled out), then preview the first five rows.
# NOTE(review): this rebinds `preg_raw`, so the unfiltered frame is only
# recoverable by re-running the load cell -- confirm that dropping every row
# with any missing field is intended.
preg_raw = preg_raw.dropna(axis=0, how='any')
preg_raw.head(5)
Out[4]:
In [8]:
# Count rows per `birthord` value: group the `caseid` column by `birthord`,
# count the non-null ids in each group, and sort so the largest group is first.
# The result is a one-column frame ('count') indexed by `birthord`.
birthorder_ct = (
    preg_raw[['birthord', 'caseid']]
    .groupby(['birthord'])['caseid']
    .agg(['count'])
    .sort_values(['count'], ascending=False)
)
# Single-argument print(...) behaves the same under Python 2 and Python 3,
# unlike the bare `print x` statement, which is a SyntaxError on Python 3.
print(birthorder_ct)
# Column total, shown as a sanity check on the per-group counts.
print(birthorder_ct.sum())
In [10]:
# Type 1 - Count & Probability for Discrete variable
## Probability of each birth order = its count divided by the total count.
## The 'count' column is selected explicitly: dividing the whole frame only
## yields a single column while 'count' is the frame's sole column, so the
## whole-frame form stops working once this cell (or a later one) has added
## more columns and the cell is re-run.
birthorder_ct['prob'] = birthorder_ct['count'] / birthorder_ct['count'].sum()
birthorder_ct
Out[10]:
In [22]:
# Bar chart of the per-birth-order probabilities. The frame's index supplies
# the x axis by default; `x=` expects a column label or position, so the index
# object is not passed explicitly.
ax = birthorder_ct.plot.bar(y='prob')
ax.set_xlabel('birthord')
ax.set_ylabel('prob')
ax
Out[22]:
In [26]:
# Type 3 - Normalized Count for Discrete variable
## min-max scaling: (count - min) / (max - min) maps the counts onto [0,1],
## so the smallest count becomes 0 and the largest becomes 1
ct_col = birthorder_ct['count']
ct_low = ct_col.min()
ct_high = ct_col.max()
birthorder_ct['normalized_ct'] = (ct_col - ct_low) / (ct_high - ct_low)
birthorder_ct
Out[26]:
In [27]:
# Bar chart of the min-max normalized counts. The frame's index supplies the
# x axis by default; `x=` expects a column label or position, so the index
# object is not passed explicitly.
ax = birthorder_ct.plot.bar(y='normalized_ct')
ax.set_xlabel('birthord')
ax.set_ylabel('normalized_ct')
ax
Out[27]:
In [30]:
# Type 2 - Equal Bin Size, Count & Probability for Continuous variable
## by default, 10 bins
## np.histogram returns the per-bin counts and the bin edges; there is one
## more edge than there are bins.
hist_vals, bin_edges = np.histogram(preg_raw['prglngth'])
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(hist_vals)
print(bin_edges)
In [32]:
# Draw the histogram counts as bars. Each bar starts at its bin's left edge
# (align='edge') and is as wide as the bin itself (difference between
# consecutive edges), so the bars tile the x range instead of being fixed
# 1-unit slivers regardless of the actual bin size.
plt.bar(bin_edges[:-1], hist_vals, width=np.diff(bin_edges), align='edge')
# Bin edges are increasing, so the first/last edge are the min/max.
plt.xlim(bin_edges[0], bin_edges[-1])
plt.xlabel('prglngth')
plt.ylabel('count')
plt.show()
In [40]:
# Type 4 - Equal Bin Size, Normalized Count for Continuous variable
## min-max scale the per-bin counts: (count - min) / (max - min)
min_hist_val = hist_vals.min()
max_hist_val = hist_vals.max()
hist_span = max_hist_val - min_hist_val
normalized_hist_vals = []
for bin_count in hist_vals:
    # the 1.0 factor forces float division (needed on Python 2)
    normalized_hist_vals.append((bin_count - min_hist_val) * 1.0 / hist_span)
normalized_hist_vals
Out[40]:
In [41]:
# Draw the normalized bin counts as bars. Each bar starts at its bin's left
# edge (align='edge') and is as wide as the bin itself (difference between
# consecutive edges), so the bars tile the x range instead of being fixed
# 1-unit slivers regardless of the actual bin size.
plt.bar(bin_edges[:-1], normalized_hist_vals, width=np.diff(bin_edges), align='edge')
# Bin edges are increasing, so the first/last edge are the min/max.
plt.xlim(bin_edges[0], bin_edges[-1])
plt.xlabel('prglngth')
plt.ylabel('normalized count')
plt.show()