In [1]:
import dask.dataframe as dd
import pandas as pd
import dask.multiprocessing
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
# from dask.distributed import Executor
# e = Executor(set_as_default=True)
In [2]:
# Read the NEISS CSV into a DataFrame and preview its first five rows.
raw = pd.read_csv(
    filepath_or_buffer='/home/datauser/cpsc/data/processed/neiss/neiss-2015.csv',
)
raw.head(n=5)
Out[2]:
In [ ]:
# Top ten hospitals (psu values) by number of records.
# head(10) is used because a [0:9] slice stops after nine rows, which is
# likely why one hospital appeared to be missing from the list.
raw['psu'].value_counts().head(10)
In [58]:
# Top ten most reported products (prod1 values) by number of records.
# head(10) is used because a [0:9] slice stops after nine rows.
raw['prod1'].value_counts().head(10)
Out[58]:
In [52]:
class neiss_cleaner(object):
    """Derive labelled hospital, product and race columns from a NEISS frame.

    The wrapped frame must provide the ``psu``, ``prod1``, ``race`` and
    ``race_other`` columns, which are the only columns read here.
    """

    def __init__(self, data):
        """Keep a reference to the frame to clean.

        Args:
            data: pandas DataFrame. It is NOT copied; reading
                ``processed_data`` adds columns to this same object.
        """
        self.data = data

    @staticmethod
    def append_string(append_type, row):
        """Build the prefixed label for a single record.

        Row-level counterpart of the column-wise logic in ``processed_data``.

        Args:
            append_type: ``'hospital'`` (label built from ``row['psu']``) or
                ``'product'`` (label built from ``row['prod1']``).
            row: mapping-like record supporting ``row[key]``.

        Returns:
            ``'hosp_<psu>'`` or ``'product_<prod1>'``.

        Raises:
            ValueError: if ``append_type`` is neither of the two values above.
        """
        if append_type == 'hospital':
            return 'hosp_' + str(row['psu'])
        if append_type == 'product':
            return 'product_' + str(row['prod1'])
        raise ValueError('not valid append type')

    @staticmethod
    def recode_race(row):
        """Map the race code of a single record to a text label.

        Row-level counterpart of the column-wise logic in ``processed_data``.

        Args:
            row: mapping-like record with ``'race'`` and ``'race_other'``.

        Returns:
            ``'white'`` for code 1, ``'black'`` for code 2, ``'hispanic'`` for
            code 3 when ``race_other`` is exactly ``'HISPANIC'``, otherwise
            ``'other'``.
        """
        race = row['race']
        if race == 1:
            return 'white'
        if race == 2:
            return 'black'
        if race == 3 and row['race_other'] == 'HISPANIC':
            return 'hispanic'
        return 'other'

    @property
    def processed_data(self):
        """Add ``hospital``, ``product`` and ``new_race`` columns and return the frame.

        The columns are written onto ``self.data`` itself (in place), so the
        frame passed to the constructor gains them as well. Every value is
        derived from the wrapped frame only, never from a notebook global.

        Returns:
            The same DataFrame object as ``self.data``, with the three
            derived columns set.
        """
        data = self.data

        # Column-wise string building instead of one Python call per row.
        data['hospital'] = 'hosp_' + data['psu'].astype(str)
        data['product'] = 'product_' + data['prod1'].astype(str)

        # Start from the fallback label and overwrite the recognised codes.
        race = data['race']
        new_race = pd.Series('other', index=data.index, dtype=object)
        new_race[race == 1] = 'white'
        new_race[race == 2] = 'black'
        new_race[(race == 3) & (data['race_other'] == 'HISPANIC')] = 'hispanic'
        data['new_race'] = new_race
        return data

    @property
    def crosstab(self):
        """Cross-tabulate hospital labels (rows) against product labels (columns).

        Returns:
            DataFrame of record counts from ``pd.crosstab``.
        """
        data = self.data
        # The label columns only exist once processed_data has run; derive
        # them on demand so this property does not depend on access order.
        if 'hospital' not in data.columns or 'product' not in data.columns:
            data = self.processed_data
        return pd.crosstab(data['hospital'], data['product'])
class neiss_query(object):
    """Query and plot helpers over a cleaned frame and its hospital x product table."""

    def __init__(self, cleaned_data, crosstab):
        """Store the inputs used by every query.

        Args:
            cleaned_data: DataFrame holding at least the ``hospital`` and
                ``product`` columns (plus any column named in a query).
            crosstab: DataFrame of counts indexed by hospital label with one
                column per product label.
        """
        self.data = cleaned_data
        self.crosstab = crosstab

    def retrieve_query(self, group_name, group_value, query_name, top_num=9):
        """Count ``query_name`` values among rows where ``group_name == group_value``.

        Args:
            group_name: column to filter on.
            group_value: value that column must equal.
            query_name: column whose values are counted.
            top_num: how many of the most frequent values to keep.

        Returns:
            Series of counts (most frequent first), at most ``top_num`` long.
        """
        # .loc replaces the removed .ix indexer; the mask is boolean and the
        # column is a label, so the selection is unchanged.
        matches = self.data.loc[self.data[group_name] == group_value, query_name]
        # Explicit positional slice: never interpreted as index labels.
        return matches.value_counts().iloc[:top_num]

    def get_product_by_hospital(self, hospital_name, top_num=9):
        """Most frequent products recorded by one hospital."""
        return self.retrieve_query(group_name='hospital', group_value=hospital_name,
                                   query_name='product', top_num=top_num)

    def get_hospitals_by_product(self, product_name, top_num=9):
        """Hospitals with the most records for one product."""
        return self.retrieve_query(group_name='product', group_value=product_name,
                                   query_name='hospital', top_num=top_num)

    def get_product_by_size(self, stratum_value, top_num=9):
        """Most frequent products among rows with the given ``stratum`` value."""
        return self.retrieve_query(group_name='stratum', group_value=stratum_value,
                                   query_name='product', top_num=top_num)

    def get_counts(self, count_type, product_num=None, hosp_name=None):
        """Pull one column or one row out of the crosstab.

        Args:
            count_type: ``'product'`` to select column ``product_num``, or
                ``'hospital'`` to select row ``hosp_name``.
            product_num: product label (used when ``count_type == 'product'``).
            hosp_name: hospital label (used when ``count_type == 'hospital'``).

        Returns:
            Series of counts.

        Raises:
            ValueError: if ``count_type`` is neither ``'product'`` nor
                ``'hospital'``.
        """
        if count_type == 'product':
            return self.crosstab.loc[:, product_num]
        if count_type == 'hospital':
            return self.crosstab.loc[hosp_name, :]
        raise ValueError('invalid count type input')

    def product_counts(self, product_num):
        """Per-hospital counts for one product (a crosstab column)."""
        return self.get_counts('product', product_num=product_num)

    def hospital_counts(self, hosp_name):
        """Per-product counts for one hospital (a crosstab row)."""
        return self.get_counts('hospital', hosp_name=hosp_name)

    def plot_product(self, product_num):
        """Bar chart of per-hospital counts for one product.

        Returns:
            Whatever ``py.iplot`` returns for the built figure.
        """
        counts = self.product_counts(product_num)
        bars = [go.Bar(
            x=self.crosstab.index.values.tolist(),
            y=counts.values,
        )]
        # str() keeps the title working when the label is not already a string.
        layout = go.Layout(title='Hospital Records for Product - ' + str(product_num))
        return py.iplot(go.Figure(data=bars, layout=layout))

    def plot_hospital(self, hosp_name):
        """Bar chart of per-product counts for one hospital.

        Returns:
            Whatever ``py.iplot`` returns for the built figure.
        """
        counts = self.hospital_counts(hosp_name)
        bars = [go.Bar(
            x=self.crosstab.columns.values.tolist(),
            y=counts.values,
        )]
        # str() keeps the title working when the label is not already a string.
        layout = go.Layout(title='Product Counts for Hospital - ' + str(hosp_name))
        return py.iplot(go.Figure(data=bars, layout=layout))

    def get_top_product(self, hospital_name):
        """Return the single most frequent product label for one hospital.

        Raises:
            IndexError: if no row matches ``hospital_name``.
        """
        products = self.data.loc[self.data['hospital'] == hospital_name, 'product']
        return products.value_counts().index[0]

    def top_product_for_hospital(self):
        """Count how many hospitals have each product as their top product.

        Returns:
            Series indexed by product label whose values are the number of
            hospitals for which that product is the most frequent one.
        """
        # value_counts().index lists each hospital label once.
        hospitals = self.data['hospital'].value_counts().index
        top_products = [self.get_top_product(hospital) for hospital in hospitals]
        return pd.Series(top_products).value_counts()
In [53]:
# Build the cleaner, then hand its outputs to the query helper.
cleaned = neiss_cleaner(raw)
data = neiss_query(
    cleaned_data=cleaned.processed_data,  # evaluated first: adds the label columns
    crosstab=cleaned.crosstab,            # hospital x product counts
)
# Other calls tried here:
# data.plot_hospital('hosp_38')
# data.get_product_by_size('V')
data.top_product_for_hospital()
Out[53]:
In [26]:
# Top ten products for each stratum value (the notebook uses stratum as the
# hospital size grouping).
stratums = data.data['stratum'].value_counts().index.tolist()
for stratum in stratums:
    print(stratum)
    # top_num=10 is passed explicitly: the method's default of 9 would list
    # only nine products.
    print(data.get_product_by_size(stratum, top_num=10))
In [44]:
# Hospital labels, ordered from most to fewest records.
data.data['hospital'].value_counts().index
Out[44]:
In [55]:
# Top products broken down by recoded race, then by sex.
# new_race is read from the cleaned frame held by the query object, so this
# cell does not rely on `raw` having been modified in place by the cleaner.
races = data.data['new_race'].value_counts().index.tolist()
for race in races:
    print(race)
    print(data.retrieve_query('new_race', race, query_name='product'))

# sex takes the codes 1 and 2 here.
for sex in range(1, 3):
    print(sex)
    print(data.retrieve_query('sex', sex, query_name='product'))
So far, product number 1807 seems to be the top product reported overall, but product 1842 seems to be the top item for the greatest number of hospitals. More specifically, 70 of the 82 hospitals have either product 1842 or product 1807 as their top reported product. Regarding demographics, there is not much variation: both of these products were the top reported products regardless of gender or ethnicity.
In terms of different product among
In [57]:
# Number of distinct hospitals (psu values) in the raw data.
len(raw['psu'].value_counts())
Out[57]:
In [ ]:
# Show the column labels of the raw frame as an array.
raw.columns.values
In [29]:
# Label of the first column in the cleaned frame.
data.data.columns[0]
Out[29]: