In [1]:
import dask.dataframe as dd
import pandas as pd
import dask.multiprocessing

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
# from dask.distributed import Executor
# e = Executor(set_as_default=True)

In [2]:
raw = pd.read_csv('/home/datauser/cpsc/data/processed/neiss/neiss-2015.csv')
raw.head()


Out[2]:
CPSC Case # trmt_date psu weight stratum age sex race race_other diag diag_other body_part disposition location fmv prod1 prod2 narr1 narr2
0 150733174 7/11/15 38 15.7762 V 5 1 0 NaN 57 NaN 33 1 9 0 1267 NaN 5 YR OLD MALE PLAYING SOCCER BLOCKING BALL AND... X IT
1 150734723 7/6/15 43 83.2157 S 36 1 1 NaN 57 NaN 34 1 1 0 1439 4057.0 L WRIST FX(?)/36YOWM 2 WKS AGO@G-FRIENDS HM,MO... AN OBJECT&TWISTED WRIST TRYING TO CONTOL MOWER...
2 150817487 8/2/15 51 74.8813 L 20 2 0 NaN 71 OTITIS EXTERNA 94 1 0 0 3274 NaN 20 YO F C/O EAR PAIN 1 DAY SAS WAS SWIMMING YE... T TRIED TO USE ***S TO REMOVE WAX MADE IT WORS...
3 150717776 6/26/15 41 15.7762 V 61 1 0 NaN 71 PAIN 35 1 0 0 611 NaN 61YOM W/KNEE PAIN S/P FALLING IN SHOWER 2 DAYS... NaN
4 150721694 7/4/15 42 74.8813 L 88 2 3 HISPANIC 62 NaN 75 1 0 0 1893 1807.0 88YOF PAIN TO HEAD WHEN FALL TO FLOOR WHEN WAL... DOOR AND FELL.DENIES LOC. DX HEAD INJURY

In [ ]:
# top ten hospitals, one seems to be out
raw.psu.value_counts()[0:9]

In [58]:
#top ten most reported products
raw.prod1.value_counts()[0:9]


Out[58]:
1842    28712
1807    28351
4076    16784
1205    14147
5040    12787
1211    11664
4074     8271
1884     7783
1893     7723
Name: prod1, dtype: int64

In [52]:
class neiss_cleaner(object):
    
    def __init__(self, data):
        self.data = data
    
    @staticmethod
    def append_string(append_type, row):
        if append_type == 'hospital':
            return 'hosp_' + row['psu'].__str__()
        elif append_type == 'product':
            return 'product_' + row['prod1'].__str__()
        else:
            raise Exception('not valid append type')

    @staticmethod
    def recode_race(row):
        if row['race'] == 1:
            return 'white'
        elif row['race'] == 2:
            return 'black'
        elif row['race'] == 3 and row['race_other'] == 'HISPANIC':
            return 'hispanic'
        else:
            return 'other'
    
    @property
    def processed_data(self):
        data = self.data
        data['hospital'] = data.apply(lambda x: self.append_string('hospital', x), axis=1)
        data['product'] = data.apply(lambda x: self.append_string('product', x), axis=1)
        data['new_race'] = raw.apply(lambda x: self.recode_race(x), axis=1)
        return data
    
    @property
    def crosstab(self):
        grouped = pd.crosstab(self.data['hospital'], self.data['product'])
        return grouped
        
class neiss_query(object):
    
    def __init__(self, cleaned_data, crosstab):
        self.data = cleaned_data
        self.crosstab = crosstab
        
    def retrieve_query(self, group_name, group_value, query_name, top_num=9):
        data = self.data
        subset = data.ix[data[group_name] == group_value, query_name].value_counts()[0:top_num]
        return subset

    def get_product_by_hospital(self, hospital_name, top_num=9):
        return self.retrieve_query(group_name='hospital', group_value=hospital_name,
                                   query_name='product', top_num=top_num)
    
    def get_hospitals_by_product(self, product_name, top_num=9):
        return self.retrieve_query(group_name='product', group_value=product_name,
                                   query_name='hospital', top_num=top_num)
    
    def get_product_by_size(self, stratum_value, top_num=9):
        return self.retrieve_query(group_name='stratum', group_value=stratum_value,
                                  query_name='product', top_num=top_num)
    
    def get_counts(self, count_type, product_num=None, hosp_name=None):
        if count_type == 'product':
            return self.crosstab.ix[:, product_num]
        elif count_type == 'hospital':
            return self.crosstab.ix[hosp_name,:]
        else:
            raise Exception('invalid count type input')
            
    def product_counts(self, product_num):
        return self.get_counts('product', product_num=product_num)
    
    def hospital_counts(self, hosp_name):
        return self.get_counts('hospital', hosp_name=hosp_name)
    
    def plot_product(self, product_num):
        data = self.product_counts(product_num)
        graph = [go.Bar(
            x=self.crosstab.index.values.tolist(),
            y=data.values,
        )]
        layout = go.Layout(title='Hospital Records for Product - ' + product_num)
        fig = go.Figure(data=graph, layout=layout)
        return py.iplot(fig)
    
    def plot_hospital(self, hosp_name):
        data = self.hospital_counts(hosp_name)
        graph = [go.Bar(
            x=self.crosstab.columns.values.tolist(),
            y=data.values,
        )]
        layout = go.Layout(title='Product Counts for Hospital - ' + hosp_name)
        fig = go.Figure(data=graph, layout=layout)
        return py.iplot(fig)
    
    def get_top_product(self, hospital_name):
        return self.data.ix[self.data['hospital'] == hospital_name, 'product'].value_counts().index[0]
    
    def top_product_for_hospital(self):
        hosp_dict = {}
        for hospital in self.data.hospital.value_counts().index:
            print(hospital)
            hosp_dict[hospital] = self.get_top_product(hospital)
        return pd.Series([val for val in hosp_dict.values()]).value_counts()

In [53]:
cleaned = neiss_cleaner(raw)
data = neiss_query(cleaned.processed_data, cleaned.crosstab)
# data.plot_hospital('hosp_38')
# data.get_product_by_size('V')
data.top_product_for_hospital()


hosp_21
hosp_95
hosp_20
hosp_8
hosp_31
hosp_17
hosp_3
hosp_89
hosp_38
hosp_61
hosp_73
hosp_67
hosp_90
hosp_32
hosp_42
hosp_63
hosp_51
hosp_24
hosp_14
hosp_54
hosp_58
hosp_2
hosp_41
hosp_5
hosp_64
hosp_57
hosp_44
hosp_23
hosp_77
hosp_78
hosp_50
hosp_37
hosp_25
hosp_52
hosp_91
hosp_40
hosp_22
hosp_18
hosp_26
hosp_68
hosp_93
hosp_16
hosp_34
hosp_101
hosp_33
hosp_43
hosp_66
hosp_65
hosp_82
hosp_53
hosp_56
hosp_98
hosp_7
hosp_48
hosp_70
hosp_27
hosp_4
hosp_92
hosp_99
hosp_29
hosp_96
hosp_79
hosp_10
hosp_19
hosp_72
hosp_6
hosp_62
hosp_85
hosp_12
hosp_46
hosp_28
hosp_94
hosp_39
hosp_84
hosp_11
hosp_74
hosp_86
hosp_75
hosp_59
hosp_9
hosp_81
hosp_45
Out[53]:
product_1842    42
product_1807    28
product_4076     3
product_3299     2
product_5040     2
product_1211     2
product_1267     2
product_3283     1
dtype: int64

In [26]:
# Get top ten items by size
stratums = data.data.stratum.value_counts().index.tolist()
for stratum in stratums:
    print(stratum)
    print(data.get_product_by_size(stratum))


V
product_1842    13753
product_1807    12561
product_4076     6894
product_1205     6165
product_5040     5548
product_3299     3990
product_1211     3861
product_464      3688
product_1884     3603
Name: product, dtype: int64
C
product_1211    3874
product_1842    3389
product_1205    3313
product_4076    3253
product_1807    2546
product_5040    2394
product_1267    2314
product_4057    1896
product_1893    1733
Name: product, dtype: int64
L
product_1807    4611
product_1842    4516
product_4076    2607
product_5040    2005
product_1205    1845
product_1211    1409
product_1884    1193
product_4074    1181
product_464     1155
Name: product, dtype: int64
M
product_1807    4703
product_1842    3813
product_4076    2184
product_1205    1590
product_5040    1476
product_1211    1178
product_4074    1119
product_611     1092
product_3299    1034
Name: product, dtype: int64
S
product_1807    3930
product_1842    3241
product_4076    1846
product_5040    1364
product_1211    1342
product_1205    1234
product_464     1130
product_4074     990
product_611      912
Name: product, dtype: int64

In [44]:
data.data.hospital.value_counts().index


Out[44]:
Index(['hosp_21', 'hosp_95', 'hosp_20', 'hosp_8', 'hosp_31', 'hosp_17',
       'hosp_3', 'hosp_89', 'hosp_38', 'hosp_61', 'hosp_73', 'hosp_67',
       'hosp_90', 'hosp_32', 'hosp_42', 'hosp_63', 'hosp_51', 'hosp_24',
       'hosp_14', 'hosp_54', 'hosp_58', 'hosp_2', 'hosp_41', 'hosp_5',
       'hosp_64', 'hosp_57', 'hosp_44', 'hosp_23', 'hosp_77', 'hosp_78',
       'hosp_50', 'hosp_37', 'hosp_25', 'hosp_52', 'hosp_91', 'hosp_40',
       'hosp_22', 'hosp_18', 'hosp_26', 'hosp_68', 'hosp_93', 'hosp_16',
       'hosp_34', 'hosp_101', 'hosp_33', 'hosp_43', 'hosp_66', 'hosp_65',
       'hosp_82', 'hosp_53', 'hosp_56', 'hosp_98', 'hosp_7', 'hosp_48',
       'hosp_70', 'hosp_27', 'hosp_4', 'hosp_92', 'hosp_99', 'hosp_29',
       'hosp_96', 'hosp_79', 'hosp_10', 'hosp_19', 'hosp_72', 'hosp_6',
       'hosp_62', 'hosp_85', 'hosp_12', 'hosp_46', 'hosp_28', 'hosp_94',
       'hosp_39', 'hosp_84', 'hosp_11', 'hosp_74', 'hosp_86', 'hosp_75',
       'hosp_59', 'hosp_9', 'hosp_81', 'hosp_45'],
      dtype='object')

In [55]:
races = raw.new_race.value_counts().index.tolist()
for race in races:
    print(race)
    print(data.retrieve_query('new_race', race, query_name='product'))
    
for sex in range(1,3):
    print(sex)
    print(data.retrieve_query('sex', sex, query_name='product'))


other
product_1807    13378
product_1842    11999
product_4076     7229
product_1205     5529
product_5040     5324
product_1211     4102
product_3299     3887
product_4074     3381
product_611      3199
Name: product, dtype: int64
white
product_1807    11719
product_1842    11388
product_4076     6523
product_5040     5026
product_1211     4125
product_1205     3867
product_4074     3560
product_464      3104
product_4057     3008
Name: product, dtype: int64
black
product_1842    4389
product_1205    4267
product_1211    3048
product_1807    2615
product_4076    2302
product_5040    1882
product_1893    1462
product_1884    1332
product_4057    1039
Name: product, dtype: int64
hispanic
product_1842    936
product_1267    798
product_4076    730
product_1807    639
product_5040    555
product_1205    484
product_1211    389
product_1884    352
product_4074    310
Name: product, dtype: int64
1
product_1842    11611
product_1205    11533
product_1211    10984
product_1807    10823
product_5040     9288
product_4076     7447
product_1267     4853
product_1884     4619
product_464      4256
Name: product, dtype: int64
2
product_1807    17528
product_1842    17100
product_4076     9337
product_4074     4604
product_611      4255
product_1893     3991
product_5040     3499
product_4057     3363
product_3299     3331
Name: product, dtype: int64

Findings

Overall

So far, product number 1807 seems to be the top product thats reported overall. But product 1842 seems to be the top item for the most number of hospitals. To be more specific, 70 out of the 82, hospitals have product 1842 and 1807 as their top product complaint. Regarding demographics, there's not much variatino. Both of these products were top product afflict regardless of gender or ethnicity.

Sizes

In terms of different product among


In [57]:
len(raw.psu.value_counts().index)


Out[57]:
82

In [ ]:
raw.columns.values

In [29]:
data.data.columns.values[0]


Out[29]:
'CPSC Case #'