In [5]:
import requests as rq
import zipfile as zp
import StringIO
import os
import pandas as pd
import glob
In [17]:
def makeTime():
"""Make all the links to the data we want. Data doesn't start until 2011_08"""
year = ["2013","2012","2011"]
month = ["01","02","03","04","05","06","07","08","09","10","11","12"]
container = []
for y in year:
for m in month:
if (y != "2011") or (m not in ["01","02","03","04","05","06","07"]):
container.append(y+"_"+m)
return container
In [20]:
def fileListMaker(globPath=os.path.join("citysum","*","*.xlsx")):
container = []
for f in glob.glob(globPath):
meta = os.path.split(os.path.splitext(f)[0])
date = os.path.split(meta[0])[1]
category = meta[1].split('sum')[0]
container.append((f,date,category))
return container
In [18]:
dates = makeTime()
In [19]:
for d in dates:
r = rq.get("http://www.nyc.gov/html/nypd/downloads/zip/traffic_data/{}_sum.zip".format(d))
z = zp.ZipFile(StringIO.StringIO(r.content))
z.extractall(path="citysum/{}".format(d))
In [23]:
fileList = fileListMaker()
isFirst = True
with open('allSummons.csv','wb') as f:
for d in fileList:
raw = pd.read_excel(d[0],"Table 1", skiprows=2, index_col=None, names=["type","mtd","ytd"])
backSafely = {'type': raw.ix[0]['type'].split('\n')[1], 'mtd': raw.ix[0]['mtd'].split('\n')[1], 'ytd':raw.ix[1]['ytd']}
otherMovers = {'type': raw.ix[34]['type'].split('\n')[0], 'mtd': raw.ix[34]['mtd'].split('\n')[0], 'ytd':raw.ix[34]['ytd']}
totalMovers = {'type': raw.ix[34]['type'].split('\n')[1], 'mtd': raw.ix[34]['mtd'].split('\n')[1], 'ytd':raw.ix[35]['ytd']}
df = raw[2:34]
df = df.append([backSafely, otherMovers, totalMovers], ignore_index=True)
df['location'] = d[2]
df['date'] = d[1]
#df['year'] = df['date'].apply(lambda x: x.split('_')[0])
#df['month'] = df['date'].apply(lambda x: x.split('_')[1])
if isFirst:
df.to_csv(f, header=True, index=False)
isFirst = False
else:
df.to_csv(f, header=False, index=False)
In [ ]: