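This notebook takes a MEPS (Medical Expenditure Panel Survey) dataset and computes summary statistics of total medical expenditures: the age and sex composition of the sample, the cumulative distribution of spending, and mean spending by percentile group.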
Work in progress...
Links
This notebook was adapted by Dave Backus from one created by Luke K. Min for use in the NYU Stern course Data Bootcamp. We thank Martin Hackmann for the suggestion and the reference to this report.
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Path to a local CSV copy of the MEPS H155 file, relative to this notebook.
file = '../csv/MEPS_H155.csv'
# Placeholder for a remote copy of the data; left empty and not used by the read below.
url = ''
# Load the whole file into a DataFrame using pandas' default parsing options.
meps = pd.read_csv(file)
In [4]:
# (rows, columns) of the frame loaded into `meps`.
meps.shape
Out[4]:
In [3]:
# Iterating a DataFrame yields its column labels, so this lists every column name.
list(meps)
Out[3]:
In [2]:
# Take a look at how the data is formatted.
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas;
# `pd.read_csv` is the supported reader. With `index_col=None` every CSV column
# stays a regular column and the rows get a default integer index, which is
# what the old call produced as well.
MEPS = pd.read_csv('DATA/H155.CSV', index_col=None)
MEPS.head()
Out[2]:
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline
In [3]:
# List every column label in the frame. Cross-check the names against the crosswalk.
MEPS.columns.tolist()
Out[3]:
In [4]:
# Size of the frame as a (rows, columns) tuple.
MEPS.shape
Out[4]:
In [21]:
# Take a brief look at which age groups we are looking at. A histogram.
import seaborn as sns
sns.set(style="darkgrid")
year_of_birth = MEPS.DOBYY
# Age is measured as of 2012, matching the other cells in this notebook that
# compute `2012 - MEPS.DOBYY` and the 2012 expenditure column (TOTEXP12).
# This cell originally used 2015, which shifted every age by three years.
# NOTE(review): if DOBYY contains non-positive missing-value codes they will
# show up as implausibly large ages — confirm against the codebook.
age = 2012 - year_of_birth
# `sns.distplot` is deprecated in seaborn; `histplot` is its replacement for a
# plain histogram (no KDE overlay is drawn by default).
sns.histplot(age, kde=False)
plt.xlabel('age')
plt.ylabel('observations')
plt.title('Age Groups')
Out[21]:
In [16]:
# Summary statistics of the `age` series computed in the histogram cell.
age.describe()
Out[16]:
In [17]:
# Now let's look at the medical expenditures column.
# NOTE(review): `t` is reassigned in the next cell before any visible use,
# so this assignment looks redundant.
t = MEPS.TOTEXP12
In [18]:
# Sort expenditures from smallest to largest. `Series.order()` has been removed
# from pandas; `sort_values()` is its replacement and sorts ascending by default.
total_exp = MEPS.TOTEXP12.sort_values()
# Discard the original row labels so `t` is a Series indexed 0..n-1 in
# ascending order of spending (same result as reset_index().TOTEXP12).
t = total_exp.reset_index(drop=True)
In [158]:
# We would like to see a cumulative distribution of health care spending.
# `t` holds expenditures sorted ascending, so the running sum divided by the
# grand total is the share of all spending accounted for by the lowest spenders.
# The denominators are computed from the data. The original hard-coded 15221120
# and 5175 (presumably the total and row count of an earlier extract), which
# silently mis-scale both axes for any other sample.
# float(...) keeps the division exact even under Python 2 integer semantics.
cdf = (np.cumsum(t) / float(t.sum())).reset_index()
# Row i (0-based) is the (i + 1)-th smallest spender, so the share of people
# covered up to and including that row is (i + 1) / n; the curve ends at (1, 1).
cdf['x'] = np.arange(1, len(t) + 1) / float(len(t))
cdf.plot(x='x', y='TOTEXP12')
Out[158]:
In [113]:
# Now we would like to weave the two together - age and medical expenses.
# Number of rows for each (birth year, expenditure) pair.
# NOTE(review): `age_exp` is not used by the scatter plot below or by any
# later cell visible in this notebook.
age_exp = MEPS.groupby(['DOBYY', 'TOTEXP12']).size()
# One point per row: age (2012 minus birth year) against total expenditure.
plt.scatter(x=(2012 - MEPS.DOBYY), y=MEPS.TOTEXP12)
plt.xlabel('Age')
plt.ylabel('2012 Total Medical Expenditure')
plt.title('Age vs. Medical Expenditure')
# Fit the axis limits to the plotted data.
plt.axis('tight')
# TODO: Run a regression? Ask Prof Backus about this.
Out[113]:
In [30]:
# Age in 2012 and total expenditure, bound to module-level names that the
# later cells (regression plot, percentile means) reuse.
age = 2012 - MEPS.DOBYY
exp = MEPS.TOTEXP12
# Joint plot of expenditure against age.
sns.jointplot(x=age, y=exp)
Out[30]:
In [34]:
# Try running a regression? seaborn's regplot draws the scatter of
# expenditure against age together with a fitted regression line.
sns.regplot(x=age, y=exp)
Out[34]:
In [36]:
# Gender breakdown of the sample: bar chart of how many rows carry each SEX code.
# (There are more female observations.)
MEPS['SEX'].value_counts().plot.bar()
Out[36]:
In [63]:
# Gender and expenditures: one strip of points per SEX code, with TOTEXP12 on the y axis.
sns.stripplot(x=MEPS.SEX, y=MEPS.TOTEXP12)
Out[63]:
In [126]:
def bottom_percent(list, percent):
    """Sum the values that lie strictly below the given percentile.

    Parameters
    ----------
    list : array-like of numbers
        The sample (plain list, NumPy array or pandas Series). The parameter
        name shadows the builtin but is kept so existing callers still work.
    percent : float
        Percentile in the range [0, 100], as accepted by ``np.percentile``.

    Returns
    -------
    number
        Sum of all values ``x`` with ``x < np.percentile(list, percent)``;
        0 when the input is empty or no value is below the cutoff.
    """
    values = np.asarray(list)
    # An empty sample has no percentile; the loop-based original returned 0 here.
    if values.size == 0:
        return 0
    # Compute the cutoff once instead of once per element.
    cutoff = np.percentile(values, percent)
    return values[values < cutoff].sum()
def top_percent(list, percent):
    """Sum the values that lie strictly above the given percentile.

    Parameters
    ----------
    list : array-like of numbers
        The sample (plain list, NumPy array or pandas Series). The parameter
        name shadows the builtin but is kept so existing callers still work.
    percent : float
        Percentile in the range [0, 100], as accepted by ``np.percentile``.

    Returns
    -------
    number
        Sum of all values ``x`` with ``x > np.percentile(list, percent)``;
        0 when the input is empty or no value is above the cutoff.
    """
    values = np.asarray(list)
    # An empty sample has no percentile; the loop-based original returned 0 here.
    if values.size == 0:
        return 0
    # Compute the cutoff once instead of once per element.
    cutoff = np.percentile(values, percent)
    return values[values > cutoff].sum()
In [194]:
#Mean spending of portions of sample
def bottom_mean_spending(list, percent):
    """Mean of the values at or below the given percentile.

    Parameters
    ----------
    list : array-like of numbers
        The sample. It is coerced with ``np.asarray`` so plain Python lists
        work as well as NumPy arrays and pandas Series (the element-wise
        comparison below is not defined for a bare list).
    percent : float
        Percentile in the range [0, 100], as accepted by ``np.percentile``.

    Returns
    -------
    float
        ``np.mean`` of all values ``x`` with ``x <= np.percentile(list, percent)``.
    """
    values = np.asarray(list)
    cutoff = np.percentile(values, percent)
    return np.mean(values[values <= cutoff])
def top_mean_spending(list, percent):
    """Mean of the values at or above the given percentile.

    Parameters
    ----------
    list : array-like of numbers
        The sample. It is coerced with ``np.asarray`` so plain Python lists
        work as well as NumPy arrays and pandas Series (the element-wise
        comparison below is not defined for a bare list).
    percent : float
        Percentile in the range [0, 100], as accepted by ``np.percentile``.

    Returns
    -------
    float
        ``np.mean`` of all values ``x`` with ``x >= np.percentile(list, percent)``.
    """
    values = np.asarray(list)
    cutoff = np.percentile(values, percent)
    return np.mean(values[values >= cutoff])
# Mean expenditure within each slice of the distribution of `exp`:
# the bottom half, then progressively narrower slices of the top.
Means = (bottom_mean_spending(exp, 50),
         top_mean_spending(exp, 50),
         top_mean_spending(exp, 70),
         top_mean_spending(exp, 90),
         top_mean_spending(exp, 95),
         top_mean_spending(exp, 99),
         top_mean_spending(exp, 99.9))
# Bar labels, in the same order as the entries of `Means`.
bins = ("Bottom 50%", "Top 50%", "Top 30%",
        "Top 10%", "Top 5%", "Top 1%", "Top 0.1%")
# Pass the vectors by keyword: current seaborn accepts only `data` positionally,
# so `sns.barplot(bins, Means)` fails there. Lists are used instead of tuples
# so the values are unambiguously treated as data vectors.
sns.barplot(x=list(bins), y=list(Means))
plt.title('Mean Medical Expenditure 2012')
plt.xlabel('Percent of Population, Ordered')
plt.ylabel('Mean Annual Expenditure per Person')
Out[194]:
In [ ]:
In [ ]:
In [46]:
# Median (50th percentile) of total expenditure `exp`.
np.percentile(exp, 50)
Out[46]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [35]:
## THIS IS JUST A SAMPLE CODE FROM THE WEB.
# Stacked bar chart of made-up scores for five groups, men at the bottom and
# women stacked on top, each with its own error bars.
import numpy as np
import matplotlib.pyplot as plt
N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars: can also be len(x) sequence
# Each series is drawn with its own standard deviations (the sample had the
# men's and women's error bars swapped).
# align='center' fixes the bar placement explicitly, so the tick positions
# below line up with the bars whatever the matplotlib default alignment is.
p1 = plt.bar(ind, menMeans, width, color='r', yerr=menStd, align='center')
p2 = plt.bar(ind, womenMeans, width, color='y',
             bottom=menMeans, yerr=womenStd, align='center')
plt.ylabel('Scores')
plt.title('Scores by group and gender')
# Bars are centred on `ind`, so the group labels go at `ind` too.
plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5'))
plt.yticks(np.arange(0, 81, 10))
plt.legend((p1[0], p2[0]), ('Men', 'Women'))
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: