In [2]:
import numpy as np
import pandas as pd

In [3]:
# Configure how pandas renders frames in this notebook.
# Full option keys are spelled out: abbreviated patterns such as 'html' or
# 'max_columns' must match exactly one registered option, and pandas raises
# an OptionError when a pattern matches several.
pd.set_option('display.notebook_repr_html', False)  # plain-text repr instead of HTML tables
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 10)

TODO: Find a systematic way to extract all field widths and field names instead of hard-coding them below.


In [4]:
# Character widths of the fixed-width fields, in file order (167 fields).
# The first 30 fields have irregular widths and are listed explicitly; the
# remainder are regular runs and are built by repetition.
widths = (
    [6, 6, 55, 1, 1, 1, 1, 1, 1, 1,
     1, 5, 2, 1, 1, 1, 2, 5, 2, 3,
     2, 1, 4, 2, 2, 1, 1, 2, 2, 3]
    + [12] * 36        # fields 31-66
    + [6] * 26         # fields 67-92
    + [7, 4, 5] * 25   # fields 93-167, repeating (7, 4, 5) triples
)

# Sanity check: displays the number of fields described above.
len(widths)


Out[4]:
167

In [5]:
# Column labels for the 167 fixed-width fields.
# Every field starts out labelled with its 1-based field number; the fields
# that have been identified so far are then given descriptive names.
# list(...) is required so the labels can be assigned by index (a bare
# range object is immutable on Python 3).
names = list(range(1, 168))

# (1-based field number, label) for each individually identified field.
# NOTE(review): 'Admit_Weedkay' looks like a typo for 'Admit_Weekday'; it is
# kept unchanged because it is the column label the parsed frame currently
# carries -- confirm nothing depends on it before renaming.
named_fields = (
    (1, 'Discharge'),
    (2, 'THCIC_ID'),
    (3, 'Provider_Name'),
    (14, 'Sex'),
    (17, 'Pat_State'),
    (18, 'Pat_ZIP'),
    (22, 'Admit_Weedkay'),
    (23, 'Length_of_Stay'),
    (24, 'Pat_Age'),
    (25, 'Pat_Status'),
    (26, 'Race'),
    (61, 'Total_Charges'),
    (67, 'Admitting_Diagnosis'),
    (68, 'Princ_Diag_Code'),
)
for field_no, label in named_fields:
    names[field_no - 1] = label

# Fields 69-92 become Oth_Diag_Code_1 .. Oth_Diag_Code_24.
for field_no in range(69, 92 + 1):
    names[field_no - 1] = 'Oth_Diag_Code_' + str(field_no - 68)

# Field 250 lies beyond the 167 fields described here, so it cannot be named yet.
#names[250-1]='Risk_Mortality'

In [6]:
# Parse the fixed-width text file into a DataFrame: one column per entry in
# `widths`, labelled by the corresponding entry in `names`. header=None tells
# pandas not to take column labels from a row of the file.
data = pd.read_fwf(
    '/var/datasets/dshs/CD2007Q1/PUDF_base1q2007.txt',
    header=None,
    names=names,
    widths=widths
)

In [7]:
# Print a summary of the parsed frame: index range, column labels and dtypes.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 740288 entries, 0 to 740287
Data columns (total 167 columns):
Discharge              object
THCIC_ID               int64
Provider_Name          object
4                      object
5                      object
6                      object
7                      object
8                      object
9                      object
10                     object
11                     object
12                     object
13                     int64
Sex                    object
15                     object
16                     object
Pat_State              object
Pat_ZIP                object
19                     object
20                     float64
21                     float64
Admit_Weedkay          int64
Length_of_Stay         int64
Pat_Age                int64
Pat_Status             object
Race                   object
27                     object
28                     object
29                     object
30                     int64
31                     float64
32                     float64
33                     float64
34                     float64
35                     float64
36                     float64
37                     float64
38                     float64
39                     float64
40                     float64
41                     float64
42                     float64
43                     float64
44                     float64
45                     float64
46                     float64
47                     float64
48                     float64
49                     float64
50                     float64
51                     float64
52                     float64
53                     float64
54                     float64
55                     float64
56                     float64
57                     float64
58                     float64
59                     float64
60                     float64
Total_Charges          float64
62                     float64
63                     float64
64                     float64
65                     float64
66                     float64
Admitting_Diagnosis    object
Princ_Diag_Code        object
Oth_Diag_Code_1        object
Oth_Diag_Code_2        object
Oth_Diag_Code_3        object
Oth_Diag_Code_4        object
Oth_Diag_Code_5        object
Oth_Diag_Code_6        object
Oth_Diag_Code_7        object
Oth_Diag_Code_8        object
Oth_Diag_Code_9        object
Oth_Diag_Code_10       object
Oth_Diag_Code_11       object
Oth_Diag_Code_12       object
Oth_Diag_Code_13       object
Oth_Diag_Code_14       object
Oth_Diag_Code_15       object
Oth_Diag_Code_16       object
Oth_Diag_Code_17       object
Oth_Diag_Code_18       object
Oth_Diag_Code_19       object
Oth_Diag_Code_20       object
Oth_Diag_Code_21       object
Oth_Diag_Code_22       object
Oth_Diag_Code_23       object
Oth_Diag_Code_24       object
93                     float64
94                     float64
95                     object
96                     float64
97                     float64
98                     object
99                     float64
100                    float64
101                    object
102                    float64
103                    float64
104                    object
105                    float64
106                    float64
107                    float64
108                    float64
109                    float64
110                    float64
111                    float64
112                    float64
113                    float64
114                    float64
115                    float64
116                    float64
117                    float64
118                    float64
119                    float64
120                    float64
121                    float64
122                    float64
123                    float64
124                    float64
125                    float64
126                    float64
127                    float64
128                    float64
129                    float64
130                    float64
131                    float64
132                    float64
133                    float64
134                    float64
135                    float64
136                    float64
137                    float64
138                    float64
139                    float64
140                    float64
141                    float64
142                    float64
143                    float64
144                    float64
145                    float64
146                    float64
147                    float64
148                    float64
149                    float64
150                    float64
151                    float64
152                    float64
153                    float64
154                    float64
155                    float64
156                    float64
157                    float64
158                    float64
159                    float64
160                    float64
161                    float64
162                    float64
163                    float64
164                    float64
165                    float64
166                    float64
167                    float64
dtypes: float64(109), int64(6), object(52)

In [22]:
# Persist the parsed frame to an HDF5 file under the key 'data'; mode='w'
# replaces any existing file at that path.
# NOTE(review): the recorded PerformanceWarning output reports mixed-integer
# column labels and object-dtype columns being pickled by PyTables.
data.to_hdf(
    '/var/datasets/dshs/CD2007Q1/PUDF_base1q2007.h5',
    key='data',
    mode='w'
)


/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_items] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_items] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['Discharge', 'Provider_Name', 4, 5, 6, 7, 8, 9, 10, 11, 12, 'Sex', 15, 16, 'Pat_State', 'Pat_ZIP', 19, 'Pat_Status', 'Race', 27, 28, 29, 'Admitting_Diagnosis', 'Princ_Diag_Code', 'Oth_Diag_Code_1', 'Oth_Diag_Code_2', 'Oth_Diag_Code_3', 'Oth_Diag_Code_4', 'Oth_Diag_Code_5', 'Oth_Diag_Code_6', 'Oth_Diag_Code_7', 'Oth_Diag_Code_8', 'Oth_Diag_Code_9', 'Oth_Diag_Code_10', 'Oth_Diag_Code_11', 'Oth_Diag_Code_12', 'Oth_Diag_Code_13', 'Oth_Diag_Code_14', 'Oth_Diag_Code_15', 'Oth_Diag_Code_16', 'Oth_Diag_Code_17', 'Oth_Diag_Code_18', 'Oth_Diag_Code_19', 'Oth_Diag_Code_20', 'Oth_Diag_Code_21', 'Oth_Diag_Code_22', 'Oth_Diag_Code_23', 'Oth_Diag_Code_24', 95, 98, 101, 104]]

  warnings.warn(ws, PerformanceWarning)
/usr/local/lib/python2.7/dist-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_items] [items->None]

  warnings.warn(ws, PerformanceWarning)

In [ ]: