In [98]:
import requests as rq
import zipfile as zp
import StringIO
import os
import pandas as pd
import re
import numpy as np
import glob
In [99]:
def makeTime():
"""Make all the links to the data we want. Data doesn't start until 2011_08"""
year = ["2013","2012","2011"]
month = ["01","02","03","04","05","06","07","08","09","10","11","12"]
container = []
for y in year:
for m in month:
if (y != "2011") or (m not in ["01","02","03","04","05","06","07"]):
container.append(y+"_"+m)
return container
In [100]:
def getPrecinct(v):
if isinstance(v, unicode):
if re.match(r".*Precinct", v):
return v
else:
return None
In [202]:
def countCleaner(df, pain=None):
container = []
for index, row in df.iterrows():
temp = {'index': index}
if isinstance(row[pain], unicode):
count = row[pain].split('\n')
if re.match(r'.*\n.*', row['category']):
categories = row['category'].split('\n')
categories = [c+"_"+pain for c in categories]
temp.update(dict(zip(categories, count)))
else:
categories = row['category'].split(' ')
categories = [c+"_"+pain for c in categories]
temp.update(dict(zip(categories, count)))
else:
temp[row['category']+"_"+pain] = row[pain]
container.append(temp)
return container
In [95]:
def fileListMaker(globPath=os.path.join("cityacc","*","*.xlsx")):
container = []
for f in glob.glob(globPath):
meta = os.path.splitext(f)[0]
category = os.path.split(meta)[1]
date = os.path.split(os.path.split(meta)[0])[1]
container.append((f,date,category))
return container
In [102]:
dates = makeTime()
In [103]:
for d in dates:
try:
r = rq.get("http://www.nyc.gov/html/nypd/downloads/zip/traffic_data/{}_acc.zip".format(d))
z = zp.ZipFile(StringIO.StringIO(r.content))
z.extractall(path=os.path.join("cityacc",d))
except:
print("this date does not have a file"+str(d))
In [232]:
fileList = fileListMaker()
isFirst = True
with open('allAccidents.csv','wb') as f:
for d in fileList:
if d[2] != 'cityacc':
try:
df = pd.read_excel(d[0], "Table 1", index_col=None, skiprows=2, names=["intersection", "acc_count", "deleteMe", "person_involved_count", "acc_w_injury_count", "category", "injured", "killed", "vehicle_types", "factors"])
df = df.drop('deleteMe', axis=1)
except:
#staten island sometimes has one fewer header
print("file has one fewer column but still being process: ", d)
df = pd.read_excel(d[0], "Table 1", index_col=None, skiprows=2, names=["intersection", "acc_count", "person_involved_count", "acc_w_injury_count", "category", "injured", "killed", "vehicle_types", "factors"])
df['precinct'] = df['intersection'].map(lambda x: getPrecinct(x))
df['precinct'] = df['precinct'].fillna(method='pad')
df = df.dropna(how='all', subset=['acc_count','person_involved_count','acc_w_injury_count','category','injured','killed','vehicle_type','factors'])
df = df.dropna(how='all', subset=['acc_count'])
df = df[df.intersection != 'Intersection Address']
df = df.reset_index(drop=True)
count_killed = countCleaner(df, 'killed')
count_injured = countCleaner(df, 'injured')
killedDf = pd.DataFrame(data=count_killed).set_index('index')
injuredDf = pd.DataFrame(data=count_injured).set_index('index')
df = df.join(killedDf)
df = df.join(injuredDf)
df['location'] = d[2]
df['date'] = d[1]
#df['year'] = df['date'].apply(lambda x: x.split('_')[0])
#df['month'] = df['date'].apply(lambda x: x.split('_')[1])
#df = df.drop(["category", "injured", "killed"], axis=1)
if isFirst:
df.to_csv(f, header=True, index=False, encoding='utf8')
isFirst = False
else:
df.to_csv(f, header=False, index=False, encoding='utf8')
In [ ]: