In [98]:
import requests as rq
import zipfile as zp
import StringIO
import os
import pandas as pd
import re
import numpy as np
import glob

In [99]:
def makeTime():
    """Make all the links to the data we want. Data doesn't start until 2011_08"""
    year = ["2013","2012","2011"]
    month = ["01","02","03","04","05","06","07","08","09","10","11","12"]
    container = []
    for y in year:
        for m in month:
            if (y != "2011") or (m not in ["01","02","03","04","05","06","07"]):
                container.append(y+"_"+m)
    return container

In [100]:
def getPrecinct(v):
    if isinstance(v, unicode):
        if re.match(r".*Precinct", v):
            return v
    else:
        return None

In [202]:
def countCleaner(df, pain=None):
    container = []
    for index, row in df.iterrows():
        temp = {'index': index}
        if isinstance(row[pain], unicode):
            count = row[pain].split('\n')
            if re.match(r'.*\n.*', row['category']):
                categories = row['category'].split('\n')
                categories = [c+"_"+pain for c in categories]
                temp.update(dict(zip(categories, count)))
            else:
                categories = row['category'].split(' ')
                categories = [c+"_"+pain for c in categories]
                temp.update(dict(zip(categories, count)))
        else:
            temp[row['category']+"_"+pain] = row[pain]
        container.append(temp)
    return container

In [95]:
def fileListMaker(globPath=os.path.join("cityacc","*","*.xlsx")):
    container = []
    for f in glob.glob(globPath):
        meta = os.path.splitext(f)[0]
        category = os.path.split(meta)[1]
        date = os.path.split(os.path.split(meta)[0])[1]
        container.append((f,date,category))
    return container

In [102]:
dates = makeTime()

In [103]:
for d in dates:
    try:
        r = rq.get("http://www.nyc.gov/html/nypd/downloads/zip/traffic_data/{}_acc.zip".format(d))
        z = zp.ZipFile(StringIO.StringIO(r.content))
        z.extractall(path=os.path.join("cityacc",d))
    except:
        print("this date does not have a file"+str(d))


this date does not have a file2013_12

In [232]:
fileList = fileListMaker()
isFirst = True
with open('allAccidents.csv','wb') as f:
    for d in fileList:
        if d[2] != 'cityacc':
            try:
                df = pd.read_excel(d[0], "Table 1", index_col=None, skiprows=2, names=["intersection", "acc_count", "deleteMe", "person_involved_count", "acc_w_injury_count", "category", "injured", "killed", "vehicle_types", "factors"])
                df = df.drop('deleteMe', axis=1)
            except:
                #staten island sometimes has one fewer header
                print("file has one fewer column but still being process: ", d)
                df = pd.read_excel(d[0], "Table 1", index_col=None, skiprows=2, names=["intersection", "acc_count", "person_involved_count", "acc_w_injury_count", "category", "injured", "killed", "vehicle_types", "factors"])
            df['precinct'] = df['intersection'].map(lambda x: getPrecinct(x))
            df['precinct'] = df['precinct'].fillna(method='pad')
            df = df.dropna(how='all', subset=['acc_count','person_involved_count','acc_w_injury_count','category','injured','killed','vehicle_type','factors'])
            df = df.dropna(how='all', subset=['acc_count'])
            df = df[df.intersection != 'Intersection   Address']
            df = df.reset_index(drop=True)
            count_killed = countCleaner(df, 'killed')
            count_injured = countCleaner(df, 'injured')
            killedDf = pd.DataFrame(data=count_killed).set_index('index')
            injuredDf = pd.DataFrame(data=count_injured).set_index('index')
            df = df.join(killedDf)
            df = df.join(injuredDf)
            df['location'] = d[2]
            df['date'] = d[1]
            #df['year'] = df['date'].apply(lambda x: x.split('_')[0])
            #df['month'] = df['date'].apply(lambda x: x.split('_')[1])
            #df = df.drop(["category", "injured", "killed"], axis=1)
            if isFirst:
                df.to_csv(f, header=True, index=False, encoding='utf8')
                isFirst = False
            else:
                df.to_csv(f, header=False, index=False, encoding='utf8')


('file has one fewer column but still being process: ', ('cityacc\\2011_09\\siacc.xlsx', '2011_09', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2011_10\\siacc.xlsx', '2011_10', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2011_11\\siacc.xlsx', '2011_11', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2011_12\\siacc.xlsx', '2011_12', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_02\\siacc.xlsx', '2012_02', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_03\\siacc.xlsx', '2012_03', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_05\\siacc.xlsx', '2012_05', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_06\\siacc.xlsx', '2012_06', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_07\\siacc.xlsx', '2012_07', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2012_09\\siacc.xlsx', '2012_09', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2013_07\\siacc.xlsx', '2013_07', 'siacc'))
('file has one fewer column but still being process: ', ('cityacc\\2013_09\\siacc.xlsx', '2013_09', 'siacc'))

In [ ]: