In [71]:
import pandas as pd
import numpy as np

In [235]:
df = pd.read_csv("../data/cdc/chlamydia.csv", usecols={"FIPS", "Rate"})
df_gon = pd.read_csv("../data/cdc/gonorrhea.csv", usecols={"FIPS", "Rate"})
df_syp = pd.read_csv("../data/cdc/syphilis.csv", usecols={"FIPS", "Rate"})

In [236]:
df.dtypes


Out[236]:
FIPS     int64
Rate    object
dtype: object

In [237]:
df_2 = pd.read_csv("../app/static/js/unemployment.tsv", delim_whitespace=True)

In [238]:
df


Out[238]:
FIPS Rate
0 1001 398.2
1 1003 341.1
2 1005 613.1
3 1007 444.2
4 1009 124.4
5 1011 939.9
6 1013 804.3
7 1015 723.9
8 1017 825.5
9 1019 366.4
10 1021 402.7
11 1023 409.7
12 1025 607.0
13 1027 393.0
14 1029 213.4
15 1031 565.4
16 1033 467.7
17 1035 550.9
18 1037 330.3
19 1039 414.4
20 1041 707.9
21 1043 319.3
22 1045 573.3
23 1047 850.1
24 1049 276.0
25 1051 538.9
26 1053 502.9
27 1055 646.6
28 1057 408.1
29 1059 314.0
... ... ...
3198 72097 211.2
3199 72099 83.2
3200 72101 59.0
3201 72103 119.0
3202 72105 67.6
3203 72107 71.5
3204 72109 126.0
3205 72111 98.4
3206 72113 167.4
3207 72115 71.9
3208 72117 60.9
3209 72119 182.3
3210 72121 128.5
3211 72123 107.1
3212 72125 83.0
3213 72127 203.5
3214 72129 194.8
3215 72131 110.1
3216 72133 126.9
3217 72135 85.5
3218 72137 110.5
3219 72139 145.0
3220 72141 74.1
3221 72143 63.7
3222 72145 73.0
3223 72147 263.4
3224 72149 94.3
3225 72151 156.1
3226 72153 92.8
3227 78000 757.4

3228 rows × 2 columns


In [239]:
df_2.dtypes


Out[239]:
id        int64
rate    float64
dtype: object

In [240]:
df.shape


Out[240]:
(3228, 2)

In [241]:
df_2.shape


Out[241]:
(3218, 2)

In [242]:
df_2[df_2["id"].isin(df["FIPS"].values)].shape


Out[242]:
(3218, 2)

In [243]:
df[df["FIPS"].isin(df_2["id"].values)].shape


Out[243]:
(3218, 2)

In [244]:
df["Rate"].sort_values()


Out[244]:
274                    0.0
2036                   0.0
2385                   0.0
1432                   0.0
941                    0.0
1414                   0.0
2612                   0.0
3136                   0.0
980                    0.0
990                    0.0
96                     0.0
551                    0.0
2656                   0.0
2016                   0.0
2657                   0.0
88                     0.0
2660                   0.0
2676                   0.0
2796                   0.0
1763                   0.0
2542                   0.0
1550                   0.0
278                    0.0
1664                   0.0
1661                   0.0
1659                   0.0
256                    0.0
1694                   0.0
1702                   0.0
1635                   0.0
               ...        
2685                  96.9
1471                 968.1
2026                  97.5
958                   97.6
447                   97.7
3091                 971.7
127                  972.5
524                  975.4
1220                  98.1
3152                  98.3
3205                  98.4
264                   98.6
3171                  98.8
1925                 984.3
1149                 986.8
2493                 986.9
2304                  99.3
1353                  99.6
925                   99.9
2941                 991.1
2713                 993.7
2399                 995.0
2919    Data not available
95      Data not available
76      Data not available
86      Data not available
3146    Data not available
87      Data not available
3148    Data not available
90      Data not available
Name: Rate, dtype: object

In [245]:
df[df['Rate'] == "Data not available"]


Out[245]:
FIPS Rate
76 2105 Data not available
86 2195 Data not available
87 2198 Data not available
90 2230 Data not available
95 2275 Data not available
2919 51515 Data not available
3146 60000 Data not available
3148 69000 Data not available

In [246]:
df['Rate'] = df['Rate'].str.replace('Data not available','316.71')
df_gon['Rate'] = df_gon['Rate'].str.replace('Data not available','65.30')
df_syp['Rate'] = df_syp['Rate'].str.replace('Data not available','2.64')

In [247]:
df


Out[247]:
FIPS Rate
0 1001 398.2
1 1003 341.1
2 1005 613.1
3 1007 444.2
4 1009 124.4
5 1011 939.9
6 1013 804.3
7 1015 723.9
8 1017 825.5
9 1019 366.4
10 1021 402.7
11 1023 409.7
12 1025 607.0
13 1027 393.0
14 1029 213.4
15 1031 565.4
16 1033 467.7
17 1035 550.9
18 1037 330.3
19 1039 414.4
20 1041 707.9
21 1043 319.3
22 1045 573.3
23 1047 850.1
24 1049 276.0
25 1051 538.9
26 1053 502.9
27 1055 646.6
28 1057 408.1
29 1059 314.0
... ... ...
3198 72097 211.2
3199 72099 83.2
3200 72101 59.0
3201 72103 119.0
3202 72105 67.6
3203 72107 71.5
3204 72109 126.0
3205 72111 98.4
3206 72113 167.4
3207 72115 71.9
3208 72117 60.9
3209 72119 182.3
3210 72121 128.5
3211 72123 107.1
3212 72125 83.0
3213 72127 203.5
3214 72129 194.8
3215 72131 110.1
3216 72133 126.9
3217 72135 85.5
3218 72137 110.5
3219 72139 145.0
3220 72141 74.1
3221 72143 63.7
3222 72145 73.0
3223 72147 263.4
3224 72149 94.3
3225 72151 156.1
3226 72153 92.8
3227 78000 757.4

3228 rows × 2 columns


In [248]:
df_test = df.convert_objects(convert_numeric=True).copy()
df_test_gon = df_gon.convert_objects(convert_numeric=True).copy()
df_test_syp = df_syp.convert_objects(convert_numeric=True).copy()
df_test.dtypes


/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':
/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  app.launch_new_instance()
Out[248]:
FIPS      int64
Rate    float64
dtype: object

In [249]:
df_test.describe()


Out[249]:
FIPS Rate
count 3228.000000 3151.000000
mean 31418.451053 317.514211
std 16353.075581 187.464002
min 1001.000000 0.000000
25% 19030.500000 184.850000
50% 30026.000000 275.300000
75% 46113.500000 415.250000
max 78000.000000 995.000000

In [250]:
df_test.dtypes, df_test_gon.dtypes, df_test_syp.dtypes


Out[250]:
(FIPS      int64
 Rate    float64
 dtype: object, FIPS      int64
 Rate    float64
 dtype: object, FIPS      int64
 Rate    float64
 dtype: object)

In [251]:
df_test_syp["Rate"].mean()


Out[251]:
2.6440892193308554

In [255]:
df_test.fillna(value=316.71, inplace=True)
df_test_gon.fillna(value=65.30, inplace=True)
df_test_syp.fillna(value=2.64, inplace=True)

In [256]:
df_test.to_csv("../app/static/js/chlamydia.csv", index=False)
df_test_gon.to_csv("../app/static/js/gonorrhea.csv", index=False)
df_test_syp.to_csv("../app/static/js/syphilis.csv", index=False)

In [261]:
df_merged = pd.merge(df_test, df_test_gon, left_on='FIPS', right_on='FIPS', how='inner', sort=False).convert_objects(convert_numeric=True)


/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':

In [263]:
df_all = pd.merge(df_merged, df_test_syp, left_on='FIPS', right_on='FIPS', how='inner', sort=False).convert_objects(convert_numeric=True)


/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':

In [264]:
df_all.head()


Out[264]:
FIPS Rate_x Rate_y Rate
0 1001 398.2 86.9 0.0
1 1003 341.1 78.2 2.6
2 1005 613.1 192.1 3.7
3 1007 444.2 97.7 0.0
4 1009 124.4 10.4 0.0

In [265]:
df_all.to_csv("../app/static/js/stds.csv", index=False)

In [ ]: