In [93]:
import bokeh
import re
import requests
import subprocess
import os

In [94]:
import pandas as pd

In [95]:
RAW_FIXTURES_DIR = "./fixtures/raw"

In [96]:
CI_pattern = '95%(.)*'
Z_pattern = 'Z-(.)*'

In [117]:
urls = {'2017':"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2017%20County%20Health%20Rankings%20Indiana%20Data%20-%20v1.xls",
        '2016':"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2016%20County%20Health%20Rankings%20Indiana%20Data%20-%20v3.xls",
        '2015':"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2015%20County%20Health%20Rankings%20Indiana%20Data%20-%20v3.xls",
        '2014':"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2014%20County%20Health%20Rankings%20Indiana%20Data%20-%20v6.xls",
       '2013':"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2013%20County%20Health%20Ranking%20Indiana%20Data%20-%20v1_0.xls",
       "2012":"http://www.countyhealthrankings.org/sites/default/files/state/downloads/2012%20County%20Health%20Ranking%20Indiana%20Data%20-%20v2.xls",
       "2011": "http://www.countyhealthrankings.org/sites/default/files/state/downloads/2011%20County%20Health%20Ranking%20Indiana%20Data%20-%20v2.xls"}

In [118]:
CI_re = re.compile(CI_pattern)
Z_re = re.compile(Z_pattern)
regex_list = [CI_re, Z_re]

In [125]:
cols_to_keep=['County', 'Teen Births', 'Teen Birth Rate']

In [126]:
def get_file(url, filename):
    try:
        subprocess.call(['wget', url, '-O', filename])
    except Exception as e:
        print(filename, "failed.")
    else:
        return filename

In [127]:
def drop_bad_cols(df, regex_patterns=[]):
    bad_cols_ci = []
    for idx, col in enumerate(df.columns):
        for each_regex in regex_patterns:
            if re.search(each_regex, col):
                bad_cols_ci.append(idx)
    
    for bad_col in sorted(bad_cols_ci, reverse=True):
        df = df.drop(df.columns[bad_col], axis=1, inplace=False)

    return df

In [128]:
def keep_good_cols(df, cols_to_keep=[]):
    return df[cols_to_keep]

In [129]:
def load_dataset(year, url, sheetname="Ranked Measure Data", skiprows=1, bad_col_regex=list(), good_cols=list(), **kwargs):
    try:
        file_path = "{path}/teenbirth_{year}.xls".format(path = RAW_FIXTURES_DIR, year = year)
        url = url

        path_to_file = get_file(url = url, filename = file_path)
        df = pd.read_excel(path_to_file, sheetname=sheetname, skiprows=skiprows, **kwargs)
        df.columns = [col.strip() for col in df.columns]
        df = drop_bad_cols(df, regex_patterns=bad_col_regex)
        df = keep_good_cols(df, cols_to_keep = good_cols)
        df['year'] = year
    except KeyError as ke:
        print(year)
        print(df.columns)
        raise
    else:
        return df

In [130]:
dfs = {}
for year, url in urls.items():
    dfs[year] = load_dataset(year, url, bad_col_regex=regex_list, good_cols = cols_to_keep)

In [131]:
master_df = pd.concat([df for df in dfs.values()])

In [132]:
master_df.to_csv("./fixtures/clean/teen_births.csv", sep = ',')

In [133]:
master_df.head


Out[133]:
<bound method NDFrame.head of          County  Teen Births  Teen Birth Rate  year
0           NaN      56051.0        35.158534  2017
1         Adams        268.0        31.029293  2017
2         Allen       3147.0        35.775186  2017
3   Bartholomew        832.0        47.898676  2017
4        Benton         62.0        31.360647  2017
5     Blackford        132.0        46.643110  2017
6         Boone        242.0        16.980073  2017
7         Brown         75.0        23.098245  2017
8       Carroll        118.0        24.471174  2017
9          Cass        425.0        48.185941  2017
10        Clark       1009.0        43.381057  2017
11         Clay        247.0        38.940564  2017
12      Clinton        349.0        46.030071  2017
13     Crawford         98.0        40.279490  2017
14      Daviess        299.0        39.471947  2017
15     Dearborn        329.0        28.943433  2017
16      Decatur        264.0        45.220966  2017
17       DeKalb        380.0        37.189274  2017
18     Delaware        968.0        23.690651  2017
19       Dubois        273.0        27.096774  2017
20      Elkhart       2299.0        47.264653  2017
21      Fayette        314.0        59.503506  2017
22        Floyd        582.0        32.650771  2017
23     Fountain        180.0        47.108087  2017
24     Franklin        147.0        25.793999  2017
25       Fulton        200.0        44.014085  2017
26       Gibson        286.0        36.465638  2017
27        Grant        759.0        35.922192  2017
28       Greene        288.0        40.580527  2017
29     Hamilton        725.0        10.920813  2017
..          ...          ...              ...   ...
65        Posey        195.0        27.700000  2011
66      Pulaski        136.0        41.700000  2011
67       Putnam        385.0        38.300000  2011
68     Randolph        293.0        52.800000  2011
69       Ripley        282.0        45.900000  2011
70         Rush        221.0        52.500000  2011
71   St. Joseph       3106.0        42.300000  2011
72        Scott        349.0        70.800000  2011
73       Shelby        489.0        49.100000  2011
74      Spencer        167.0        33.700000  2011
75       Starke        329.0        59.400000  2011
76      Steuben        389.0        48.600000  2011
77     Sullivan        220.0        48.600000  2011
78  Switzerland         93.0        42.100000  2011
79   Tippecanoe       1262.0        23.900000  2011
80       Tipton        145.0        39.700000  2011
81        Union         74.0        44.200000  2011
82  Vanderburgh       2011.0        44.900000  2011
83   Vermillion        133.0        39.200000  2011
84         Vigo       1170.0        40.700000  2011
85       Wabash        370.0        40.300000  2011
86       Warren         60.0        31.000000  2011
87      Warrick        391.0        29.300000  2011
88   Washington        305.0        48.100000  2011
89        Wayne        908.0        56.200000  2011
90        Wells        186.0        28.400000  2011
91        White        266.0        50.300000  2011
92      Whitley        262.0        36.300000  2011
93          NaN          NaN              NaN  2011
94          NaN          NaN              NaN  2011

[661 rows x 4 columns]>

In [ ]: