In [1]:
import pandas as pd

# 311 service requests (2016, per the original note); the incident zip is
# read as a string.
quality = pd.read_csv(
    'finalProjectDoomed/311_Service_Requests_from_2010_to_Present.csv',
    dtype={'Incident Zip': 'str'},
)
In [2]:
# Load the table of New York City zip codes (zip read as a string).
stuff = pd.read_csv('finalProjectDoomed/nycZip.csv', dtype={'ZipNy': 'str'})
# Keep the zip/borough columns, then collapse to the distinct zip values.
zipNy = stuff[['ZipNy', 'Borough']]
zipNy = zipNy['ZipNy'].unique()
#zipNy
In [3]:
# Keep only the requests whose incident zip is one of the New York City zips,
# renumbering the rows from 0.
in_nyc = quality['Incident Zip'].isin(zipNy)
realNYC = quality.loc[in_nyc].reset_index(drop=True)
#realNYC
In [4]:
# Count how many requests fall in each zip code.
test = realNYC.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
test
Out[4]:
In [5]:
# Latitude/longitude per zip code, restricted to the New York City zips and
# indexed by ZIP.
zipy = pd.read_csv('finalProjectDoomed/zipLatLon.csv', dtype={'ZIP': 'str'})
realZip = (
    zipy.loc[zipy['ZIP'].isin(zipNy)]
    .reset_index(drop=True)
    .set_index('ZIP')
)
#realZip
In [6]:
# Turn the per-zip counts back into a DataFrame indexed by ZIP with a single
# column, NUM. The column and index are named explicitly instead of renaming
# whatever reset_index() produces, because those labels differ between pandas
# versions (pandas >= 2.0 names the value_counts() result 'count').
test = test.to_frame('NUM')
test.index.name = 'ZIP'
#test
In [7]:
# Put the per-zip counts and the lat/lon table side by side, aligned on the
# ZIP index.
graphOut = pd.concat([test, realZip], axis=1)
In [8]:
import numpy as np

# Rename LNG to LON, label the index, and keep only the rows in which LON,
# LAT and NUM are all finite.
graphOut = graphOut.rename(columns={'LNG': 'LON'})
graphOut.index.name = 'ZIP'
finite_rows = (
    np.isfinite(graphOut['LON'])
    & np.isfinite(graphOut['LAT'])
    & np.isfinite(graphOut['NUM'])
)
graphOut = graphOut[finite_rows]
In [9]:
# Save the per-zip table (graphOut) as a comma-separated file, graphOut.csv.
graphOut.to_csv("graphOut.csv", sep=',')
Kmeans Finding Clusters in the Data
In [10]:
# Count the requests per complaint type (value_counts sorts by frequency).
get = realNYC['Complaint Type'].value_counts()
# Two-column frame: 'Complaint Type' holds the label and 'index' holds its
# count; the default 0..n-1 row index is the position in that ordering.
# Built explicitly rather than via reset_index() + rename(), whose column
# labels depend on the pandas version.
get = pd.DataFrame(
    {'Complaint Type': get.index.values, 'index': get.values},
    columns=['Complaint Type', 'index'],
)
#get
In [11]:
# Map each complaint type to the row label it has in `get`, giving every
# type a numeric code.
dictGet = dict(zip(get["Complaint Type"], get.index))
#dictGet
In [14]:
# Encode every complaint as its numeric code from dictGet.
# Series.map builds a new array. The previous loop wrote the codes into
# realNYC['Complaint Type'].values, which can be a view of the frame's own
# data; that would overwrite the complaint labels that later cells filter on.
qualCom = realNYC['Complaint Type'].map(dictGet).values
In [15]:
# Copy of the NYC requests with the zip column renamed to ZIP.
q = realNYC.rename(columns={'Incident Zip': 'ZIP'})
# Keep rows whose zip is a non-negative number. ZIP holds strings, so it is
# converted first: comparing str with int is always True on Python 2 and a
# TypeError on Python 3.
# NOTE(review): qualCom is built from realNYC, so any row dropped here would
# leave q shorter than qualCom -- confirm every NYC zip is numeric.
q = q[pd.to_numeric(q['ZIP'], errors='coerce') >= 0]
In [16]:
import numpy as np
from sklearn.cluster import KMeans

# Fit K-means for k = 1..10 on (complaint code, zip) pairs.
clustZip = q['ZIP'].values
# Build an explicit two-column float array: the zips are strings, and zip()
# is a lazy iterator on Python 3 that np.matrix cannot consume. column_stack
# also fails loudly if the two inputs differ in length instead of silently
# truncating.
X = np.column_stack([
    np.asarray(qualCom, dtype=float),
    np.asarray(clustZip, dtype=float),
])
numK = range(1, 11)
# Fixed seed so the cluster labels are the same on every run.
results = [KMeans(n_clusters=k, random_state=0).fit(X) for k in numK]
In [17]:
# One-column frame (ZIP), renumbered from 0, that will receive the K-means
# labels.
groupPlot = q[["ZIP"]].reset_index(drop=True)
In [19]:
# One label column per fitted model: "cc" for the first model (an array of
# 0's), then "k2" .. "k10" for the remaining nine.
groupPlot["cc"] = results[0].labels_
for n_clusters in range(2, 11):
    groupPlot["k%d" % n_clusters] = results[n_clusters - 1].labels_
groupPlot
In [ ]:
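#Add Clusters (disabled draft, kept for reference)
# NOTE(review): this refers to `prostitution`, which is not defined in the
# cells shown here -- confirm the intended frame before re-enabling.
#for i in range(1,6):
# for j, cluster in enumerate(results[i].cluster_centers_) :
# rowAdd = [cluster[1],cluster[0],1,0,0,0,0,0,0,0,0,0]
#rowAdd[i+2] = j;
#rowAdd[2] = i+1;
#groupPlot.loc[len(prostitution)] = rowAdd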
In [21]:
# Keep one row per zip code (the first occurrence).
# .copy() makes `outy` an independent frame, so adding the ZipCode column
# below does not raise pandas' SettingWithCopyWarning.
outy = groupPlot.drop_duplicates(['ZIP']).copy()
outy['ZipCode'] = outy['ZIP']
outy = outy.set_index('ZIP')
outy
Out[21]:
In [22]:
import numpy as np

# Attach the cluster labels to the per-zip table, aligned on the ZIP index.
graphKm = pd.concat([graphOut, outy], axis=1)
# Keep only the zips that have both a k2 label and a latitude.
has_label_and_lat = np.isfinite(graphKm['k2']) & np.isfinite(graphKm['LAT'])
graphKm = graphKm[has_label_and_lat]
graphKm.to_csv("graphKm.csv", sep=',')
#graphKm
In [23]:
# Save the per-zip table with cluster labels (graphKm) as a comma-separated
# file, graphKm.csv.
graphKm.to_csv("graphKm.csv", sep=',')
Cluster Income Data
In [ ]:
# Load the income table from incomeData.csv.
income = pd.read_csv('incomeData.csv')
In [ ]:
# Keep only the rows whose STATE is "NY" and write them to incomeNy.csv.
# NOTE(review): the next cell reads 'finalProjectDoomed/incomeNy.csv', which
# is not the path written here -- confirm how the file gets there.
ny_rows = income['STATE'] == "NY"
incomeNy = income.loc[ny_rows]
incomeNy.to_csv("incomeNy.csv", sep=',')
In [29]:
import pandas as pd

# Load the New York income table, reading the zip code as a string.
income = pd.read_csv(
    'finalProjectDoomed/incomeNy.csv',
    dtype={'zipcode': 'str'},
)
In [122]:
#income
In [30]:
# Restrict the income rows to the New York City zip codes.
in_nyc_income = income['zipcode'].isin(zipNy)
incomeInc = income.loc[in_nyc_income]
In [31]:
# Pull out the three columns used to build the clustering input.
dfAgi = incomeInc['agi_stub'].values
dfZip = incomeInc['zipcode'].values
dfNum = incomeInc['N1'].values
# print() with a single argument prints the same text on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print(dfNum)
In [32]:
# Expand the table into individual samples: each row contributes int(N1)
# copies of its zip code to incX and of its agi_stub to incY.
incY = []
incX = []
for zip_code, agi, count in zip(dfZip, dfAgi, dfNum):
    copies = int(count)
    incX.extend([zip_code] * copies)
    incY.extend([agi] * copies)
In [34]:
import numpy as np
from sklearn.cluster import KMeans

# Fit K-means for k = 1..10 on (zip, agi_stub) pairs.
clustZip = incY
# Build an explicit two-column float array: the zips are strings, and zip()
# is a lazy iterator on Python 3 that np.matrix cannot consume.
X = np.column_stack([
    np.asarray(incX, dtype=float),
    np.asarray(clustZip, dtype=float),
])
numK = range(1, 11)
# Fixed seed so the cluster labels are the same on every run.
resultsInc = [KMeans(n_clusters=k, random_state=0).fit(X) for k in numK]
In [40]:
# Per-zip table without the request-count column.
# drop() returns a new frame. The previous `del groupPlotInc['NUM']` acted on
# an alias of graphOut, so it removed NUM from graphOut as well and made the
# cell fail with a KeyError when run a second time.
groupPlotInc = graphOut.drop('NUM', axis=1)
groupPlotInc
In [49]:
# One row per expanded sample: ZIP as the index, plus a ZipCode column that
# holds the same value.
groupInc = pd.DataFrame({'ZIP': incX})
groupInc['ZipCode'] = groupInc['ZIP']
groupInc = groupInc.set_index('ZIP')
groupInc
In [44]:
# One label column per fitted income model: "cc" for the first model (an
# array of 0's), then "k2" .. "k10" for the remaining nine.
groupInc["cc"] = resultsInc[0].labels_
for n_clusters in range(2, 11):
    groupInc["k%d" % n_clusters] = resultsInc[n_clusters - 1].labels_
In [51]:
# Keep the first row for each ZipCode and index the result by ZIP.
outInc = (
    groupInc.drop_duplicates(['ZipCode'])
    .reset_index()
    .set_index('ZIP')
)
#del outInc['index']
outInc
In [52]:
import numpy as np

# Join the per-zip table with the income cluster labels on the ZIP index.
graphInc = pd.concat([groupPlotInc, outInc], axis=1)
# Keep only the zips that have a k2 label, i.e. drop zips with no income data.
labelled = np.isfinite(graphInc['k2'])
graphInc = graphInc[labelled]
graphInc.to_csv("graphInc.csv", sep=',')
#graphInc
Noise complaint types combined below: 'Noise - Vehicle', 'Noise', 'Noise - Residential', 'Noise - Street/Sidewalk', 'Noise - Commercial'
In [65]:
# Select each noise-related complaint type from the New York City requests.
complaint_type = realNYC['Complaint Type']
noise1 = realNYC.loc[complaint_type == 'Noise - Vehicle']
noise2 = realNYC.loc[complaint_type == 'Noise']
noise3 = realNYC.loc[complaint_type == 'Noise - Residential']
noise4 = realNYC.loc[complaint_type == 'Noise - Street/Sidewalk']
noise5 = realNYC.loc[complaint_type == 'Noise - Commercial']
In [66]:
# Stack the five noise selections into one frame, renumbered from 0.
noise = pd.concat([noise1, noise2, noise3, noise4, noise5]).reset_index(drop=True)
In [150]:
# Count the noise complaints per zip code.
noiseOut = noise.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
# Frame indexed by ZIP with the count in 'Noise' and the zip repeated in
# 'ZipCode'. Names are set explicitly because the labels that reset_index()
# yields for a value_counts() result depend on the pandas version.
noiseOut = noiseOut.to_frame('Noise')
noiseOut.index.name = 'ZIP'
noiseOut['ZipCode'] = noiseOut.index
# Join with the per-zip table on the ZIP index.
noiseStep = graphOut
noiseTry = [noiseStep, noiseOut]
noiseOutp = pd.concat(noiseTry, axis=1)
#noiseOutp
In [151]:
# Food-related complaints: 'Food Poisoning' followed by 'Food Establishment',
# stacked into one frame renumbered from 0.
complaint_type = realNYC['Complaint Type']
foodPosioning = realNYC.loc[complaint_type == 'Food Poisoning']
foodEstablishment = realNYC.loc[complaint_type == 'Food Establishment']
food = pd.concat([foodPosioning, foodEstablishment]).reset_index(drop=True)
In [152]:
# Count the food complaints per zip code.
foodOut = food.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
# Frame indexed by ZIP with the count in 'Food'. Names are set explicitly
# because the labels that reset_index() yields for a value_counts() result
# depend on the pandas version.
foodOut = foodOut.to_frame('Food')
foodOut.index.name = 'ZIP'
# Join with the per-zip table on the ZIP index.
foodStep = graphOut
foodTry = [foodStep, foodOut]
foodOutp = pd.concat(foodTry, axis=1)
#foodOutp
In [140]:
# Homelessness-related complaints: 'Homeless Person Assistance' followed by
# 'Homeless Encampment', stacked into one frame renumbered from 0.
complaint_type = realNYC['Complaint Type']
homelessPerson = realNYC.loc[complaint_type == 'Homeless Person Assistance']
HomelessEncampment = realNYC.loc[complaint_type == 'Homeless Encampment']
homeless = pd.concat([homelessPerson, HomelessEncampment]).reset_index(drop=True)
In [153]:
# Count the homelessness complaints per zip code.
homeOut = homeless.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
# Frame indexed by ZIP with the count in 'Homeless'. Names are set explicitly
# because the labels that reset_index() yields for a value_counts() result
# depend on the pandas version.
homeOut = homeOut.to_frame('Homeless')
homeOut.index.name = 'ZIP'
# Join with the per-zip table on the ZIP index.
homeStep = graphOut
homeTry = [homeStep, homeOut]
homeOutp = pd.concat(homeTry, axis=1)
#homeOutp
In [74]:
# Neighbourhood-condition complaints: five complaint types selected
# separately, then stacked (street, street light, derelict vehicle, sweeping,
# graffiti) into one frame renumbered from 0.
complaint_type = realNYC['Complaint Type']
streetRoad = realNYC.loc[complaint_type == 'Street Condition']
streetLight = realNYC.loc[complaint_type == 'Street Light Condition']
sweeping = realNYC.loc[complaint_type == 'Sweeping/Inadequate']
graffiti = realNYC.loc[complaint_type == 'Graffiti']
derelictV = realNYC.loc[complaint_type == 'Derelict Vehicle']
neighbourhood = pd.concat(
    [streetRoad, streetLight, derelictV, sweeping, graffiti]
).reset_index(drop=True)
In [154]:
# Count the neighbourhood-condition complaints per zip code.
neighOut = neighbourhood.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
# Frame indexed by ZIP with the count in 'Neighbourhood'. Names are set
# explicitly because the labels that reset_index() yields for a
# value_counts() result depend on the pandas version.
neighOut = neighOut.to_frame('Neighbourhood')
neighOut.index.name = 'ZIP'
# Join with the per-zip table on the ZIP index.
neighStep = graphOut
neighTry = [neighStep, neighOut]
neighOutp = pd.concat(neighTry, axis=1)
#neighOutp
In [142]:
# Sanitation-related complaints: seven complaint types selected separately,
# then stacked into one frame renumbered from 0.
complaint_type = realNYC['Complaint Type']
rodent = realNYC.loc[complaint_type == 'Rodent']
dirty = realNYC.loc[complaint_type == 'Dirty Conditions']
sanitationC = realNYC.loc[complaint_type == 'Sanitation Condition']
sewer = realNYC.loc[complaint_type == 'Sewer']
bask = realNYC.loc[complaint_type == 'Overflowing Recycling Baskets']
unsan = realNYC.loc[complaint_type == 'UNSANITARY CONDITION']
missed = realNYC.loc[complaint_type == 'Missed Collection (All Materials)']
sanitation = pd.concat(
    [rodent, dirty, sanitationC, sewer, bask, unsan, missed]
).reset_index(drop=True)
In [211]:
# Count the sanitation complaints per zip code.
sanOut = sanitation.rename(columns={'Incident Zip': 'ZIP'})['ZIP'].value_counts()
#len(sanOut)
# Frame indexed by ZIP with the count in 'Sanitation'. Names are set
# explicitly because the labels that reset_index() yields for a
# value_counts() result depend on the pandas version.
sanOut = sanOut.to_frame('Sanitation')
sanOut.index.name = 'ZIP'
# Join with the per-zip table on the ZIP index.
sanStep = graphOut
sanTry = [sanStep, sanOut]
sanOutp = pd.concat(sanTry, axis=1)
#sanOutp
In [328]:
# Put the five per-zip count tables side by side, aligned on the ZIP index.
graphMove = pd.concat([sanOut, foodOut, homeOut, neighOut, noiseOut], axis=1)
In [330]:
# Replace every missing value in the combined table with 0.
g2 = graphMove.fillna(value=0)
Out[330]:
In [304]:
# Keep the rows whose ZipCode is a positive number. Once missing values have
# been filled with 0 the column mixes zip strings with the integer 0, so it
# is converted before comparing (str > int is always True on Python 2 and a
# TypeError on Python 3).
# NOTE(review): ZipCode comes only from noiseOut, so this also drops zips
# that have no noise complaints -- confirm that is intended.
g2 = g2[pd.to_numeric(g2['ZipCode'], errors='coerce') > 0]
g2
Out[304]:
In [305]:
# Save the combined per-zip complaint counts (g2) as a comma-separated file,
# graphMove.csv.
g2.to_csv("graphMove.csv", sep=',')
In [326]:
# Heat Maps
# Latitude/longitude arrays of the neighbourhood-condition complaints, keyed
# 'lat' and 'lon'.
geo_data = dict(
    lat=neighbourhood['Latitude'].values,
    lon=neighbourhood['Longitude'].values,
)
#geo_data2 = {'lat':sanitation['Latitude'].values, 'lon':sanitation['Longitude'].values}
#geo_data3 = {'lat':neighbourhood['Latitude'].values, 'lon':neighbourhood['Longitude'].values}
#geo_data
In [327]:
import numpy as np
import geoplotlib as gp
from geoplotlib.utils import BoundingBox

# Bounding box of the plotted points. nanmax/nanmin ignore missing
# coordinates; the builtin max()/min() give order-dependent results when the
# array contains NaN.
max_lat = np.nanmax(geo_data['lat'])
# print() with a single argument works on both Python 2 and Python 3.
print(max_lat)
min_lon = np.nanmin(geo_data['lon'])
#print min_lon
min_lat = np.nanmin(geo_data['lat'])
#print min_lat
max_lon = np.nanmax(geo_data['lon'])
#print max_lon
# Fit the map view to the data (New York City complaints) and draw a kernel
# density heat map.
bbox = BoundingBox(north=max_lat, west=min_lon, south=min_lat, east=max_lon)
gp.set_bbox(bbox)
gp.kde(geo_data, 5)
gp.show()
In [ ]:
import numpy as np
import geoplotlib as gp
from geoplotlib.utils import BoundingBox

# Heat map of the sanitation complaints.
# geo_data2 is built here because its only other definition in the notebook
# is commented out, which made this cell fail with a NameError.
geo_data2 = {
    'lat': sanitation['Latitude'].values,
    'lon': sanitation['Longitude'].values,
}
# Bounding box of the plotted points; nanmax/nanmin ignore missing
# coordinates.
max_lat2 = np.nanmax(geo_data2['lat'])
min_lon2 = np.nanmin(geo_data2['lon'])
#print min_lon
min_lat2 = np.nanmin(geo_data2['lat'])
#print min_lat
max_lon2 = np.nanmax(geo_data2['lon'])
#print max_lon
# Fit the map view to the data (New York City complaints) and draw a kernel
# density heat map.
bbox = BoundingBox(north=max_lat2, west=min_lon2, south=min_lat2, east=max_lon2)
gp.set_bbox(bbox)
gp.kde(geo_data2, 5)
gp.show()
In [ ]:
import numpy as np
import geoplotlib as gp
from geoplotlib.utils import BoundingBox

# Heat map of the neighbourhood-condition complaints with a smaller kde
# parameter (3 instead of 5).
# geo_data3 is built here because its only other definition in the notebook
# is commented out (as the neighbourhood coordinates), which made this cell
# fail with a NameError. The bounding box now uses the same data that is
# plotted.
geo_data3 = {
    'lat': neighbourhood['Latitude'].values,
    'lon': neighbourhood['Longitude'].values,
}
# Bounding box of the plotted points; nanmax/nanmin ignore missing
# coordinates.
max_lat3 = np.nanmax(geo_data3['lat'])
min_lon3 = np.nanmin(geo_data3['lon'])
#print min_lon
min_lat3 = np.nanmin(geo_data3['lat'])
#print min_lat
max_lon3 = np.nanmax(geo_data3['lon'])
#print max_lon
# Fit the map view to the data (New York City complaints) and draw the heat
# map.
bbox = BoundingBox(north=max_lat3, west=min_lon3, south=min_lat3, east=max_lon3)
gp.set_bbox(bbox)
gp.kde(geo_data3, 3)
gp.show()