In [13]:
import getpass
current_user = getpass.getuser()
if current_user == "kedo":
DATA_DIR = '/home/kedo/Dropbox/CS 194-16/datascience-hanzai-keisu/src/data/'
elif current_user == "ylabur":
DATA_DIR = '/home/ylabur/git/datascience-hanzai-keisu/src/data/'
elif current_user == "Allen":
DATA_DIR = '/Users/Allen/Dropbox/CS194FinalProject/datascience-hanzai-keisu/src/data/'
else:
DATA_DIR = '.'
In [4]:
import csv
import datetime
import os
from sqlalchemy import (
create_engine,
Column,
Integer,
String,
Date,
Time,
func,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
Base = declarative_base()
# create a sqlite db in memory
# pass in echo=True to debug SQL
engine = create_engine('sqlite:////' + os.path.join(DATA_DIR, 'sqlite.db'))
Session = sessionmaker(bind=engine)
class Incident(Base):
"""
An incident from the csv file.
Note that the incident number isn't a primary key, so I the row_number (from the CSV) the primary_key
"""
__tablename__ = 'incidents'
row_number = Column(Integer, primary_key=True)
number = Column(Integer)
category = Column(String(32))
description = Column(String(64))
date = Column(Date)
time = Column(Time)
pd_district = Column(String(32))
resolution = Column(String(32))
neighborhood = Column(String(32))
def __repr__(self):
return "<Row {row}, Incident #{number} '{category}'>".format(
row=self.row_number,
number=self.number,
category=self.category)
# re-create the schema in the db
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine, checkfirst=True)
def parse_sfpd_incidents_csv(filename):
with open(filename, 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
reader.next() # skip the header
row_number = 0
for row in reader:
yield (row_number, row)
row_number += 1
print "Initiating import of CSV into SQLite"
session = Session()
row_generator = parse_sfpd_incidents_csv(os.path.join(DATA_DIR, 'sfpd_incidents_2013_with_neighborhoods.csv'))
for (row_number, row) in row_generator:
# sfpd_incidents_2013_with_neighborhoods.csv is in the following format:
# Incident Number,Category,Description,Date,Time,PD District,Resolution,Longitude,Latitude,Neighborhood
# note that there can be more than 1 row with the same incident number
incident = Incident(row_number=row_number,
number=row[0],
category=row[1],
description=row[2],
date=datetime.datetime.strptime(row[3], '%m/%d/%Y').date(),
time=datetime.datetime.strptime(row[4], '%H:%M').time(),
pd_district=row[5],
resolution=row[6],
neighborhood=row[9])
session.add(incident)
session.commit()
print "Finished importing CSV into SQLite"
We see which how many of each category there is in the incident DB.
In [3]:
from sqlalchemy import func
from sqlalchemy import desc
session = Session()
query = session.query(Incident.category, func.count(Incident.category)\
.label('count'))\
.group_by(Incident.category)\
.order_by(desc('count'))
incident_categories = list()
print "{:<30} {:<20}".format("Category", "Count")
for category, num_incidents in query:
incident_categories.append(category)
print "{:<30} {:<20}".format(category, num_incidents)
In [5]:
import csv
import numpy
from numpy import arange,array,ones
from pylab import plot,show
from scipy import stats
import matplotlib.pyplot as plt
# cities and their populations
cities = {u'buena vista park': 2821,
u'north waterfront': 1904,
u'forest hill': 3322,
u'inner richmond': 17074,
u'van ness - civic center': 11686,
u'lakeside': 1751,
u'mission bay': 4789,
u'midtown terrace': 3742,
u'eureka valley - dolores heights - castro': 12064,
u'miraloma park': 3648,
u'western addition': 11981,
u'lake': 8673,
u'marina': 10246,
u'ingleside terrace': 2353,
u'inner sunset': 37388,
u'castro-upper market': 2631,
u'silver terrace': 6781,
u'st. francis wood': 1318,
u'sea cliff': 829,
u'telegraph hill': 6766,
u'mission': 2874,
u'': 1,
u'russian hill': 12967,
u'parnassus - ashbury': 5340,
u'twin peaks': 3859,
u'portola': 12962,
u'mount davidson manor': 1702,
u'ocean view': 6271,
u'hayes valley': 4889,
u'central sunset': 20577,
u'presidio heights': 4091,
u'lower pacific heights': 10747,
u'forest hill extension': 3422,
u'golden gate heights': 2469,
u'westwood highlands': 1694,
u'oceanview': 6271,
u'ingleside heights': 6702,
u'noe valley': 22009,
u'outer parkside': 13857,
u'south beach': 5648,
u'central waterfront - dogpatch': 820,
u'lakeshore': 1,
u'diamond heights': 2153,
u'haight-ashbury': 10523,
u'nob hill': 17900,
u'outer richmond': 16821,
u'chinatown': 10775,
u'yerba buena': 10,
u'financial district': 9301,
u'glen park': 6066,
u'excelsior': 25999,
u'downtown': 19178,
u'forest knolls': 2889,
u'west portal': 2763,
u'westwood park': 1933,
u'outer sunset': 74129,
u'parkside': 19871,
u'cow hollow': 7314,
u'inner parkside': 5122,
u'lone mountain': 7668,
u'north panhandle': 10387,
u'bernal heights': 24430,
u'mission terrace': 1,
u'west of twin peaks': 4961,
u'south of market': 11130,
u'jordan park - laurel heights': 4170,
u'crocker amazon': 9790,
u'potrero hill': 13725,
u'ingleside': 9082,
u'pacific heights': 10747,
u'seacliff': 829,
u'corona heights': 3488,
u'outer mission': 9328,
u'merced heights': 3852,
u'visitacion valley': 23054,
u'central richmond': 25617,
u'sunnyside': 8279,
u'bayview': 17716,
u'north beach': 17675,
u'little hollywood': 983}
#crimes is a dictionary mapping of crime categories to number of incidents
crimes = {}
# incidents is a dictioanry mapping of neighborhoods to average home index values and crimes
incidents = {}
# initialize crimes to have a dict entry for each category
for value in session.query(Incident.category).distinct():
crimes[value[0]] = 0
reader = csv.reader(open('data/ZillowSFExcel.csv', 'rU'), delimiter=',', quotechar='"')
# assign the home index value and a copy of crimes to each neighborhood
for row in reader:
try:
incidents[unicode(row[0].lower())] = (int(row[1]), crimes.copy())
except:
pass
# for each crime report, tally up the corresponding crime category in the corresponding neighborhood
for instance in session.query(Incident):
if instance.neighborhood.lower() not in incidents:
temp = crimes.copy()
temp[instance.category] += 1
incidents[instance.neighborhood.lower()] = (-1, temp)
else:
temp = incidents[instance.neighborhood.lower()][1]
temp[instance.category] += 1
incidents[instance.neighborhood.lower()] = (incidents[instance.neighborhood.lower()][0], temp)
#print incidents
print "Generated incidents dictionary"
In [5]:
# function that, given the list of crime categories, will pick out the crime rate for each neighborhood for that
# selection.
# threshold is used to filter out some of the outlier-ish crime rates
# returns a pair of arrays, home index values and crime rates
def get_plots(categories=None, threshold=0.2):
if categories == None:
categories = [cat for cat in crimes.keys()]
indexes = []
crime_rates = []
for key, value in incidents.items():
num_crimes = 0
for cat in categories:
num_crimes += value[1][cat]
if key in cities:
crime_rate = num_crimes / (cities[key] + 0.0)
else:
crime_rate = -1
if crime_rate > threshold:
crime_rate = -1
if crime_rate > 0 and value[0] > 0:
crime_rates.append(crime_rate)
indexes.append(value[0])
return indexes, crime_rates
# function that does a linear regression over the datapoints generated by the above function.
# returns the r_value, p_value, std_err, and line
def run_lin_reg(x=[],y=[]):
xi = array(x)
slope, intercept, r_value, p_value, std_err = stats.linregress(xi,y)
line = slope*xi+intercept
plot(xi,line,'r-',xi,y,'o')
show()
return r_value, p_value, std_err, line
In [6]:
x, y = get_plots([u'LARCENY/THEFT', u'ROBBERY', u'BURGLARY', u'STOLEN PROPERTY'])
r_value, p_value, std_err, line = run_lin_reg(x, y)
We run a linear regression on the "crime rate" of incidents of each category in each neighborhood, and then sort it to find the weights of each category.
In [7]:
# fit a linear regression model
from sklearn import linear_model
regression = linear_model.LinearRegression()
# here, X is a list of rows of neighborhoods
# where each row is the number of incidents for a certain category in that neighborhood
# and the order of the rows is defined by incident_categories
X = []
y = []
for neighborhood, data in incidents.iteritems():
zillow_price, incidents_by_category = data
population = cities[neighborhood]
x = []
for category in incident_categories:
if category in incidents_by_category:
x.append(incidents_by_category[category] / float(population))
else:
print neighborhood, " didn't have anything for category ", category
x.append(0)
X.append(x)
y.append(zillow_price)
regression.fit(X, y)
print regression.score(X, y)
category_weights = zip(incident_categories, regression.coef_)
category_weights.sort(key=lambda x: x[1])
print "{:<30} {:<20}".format("Category", "Regression Coefficient")
for c, w in category_weights:
print "{:<30} {:<20}".format(c, w)
In [8]:
import csv
from sets import Set
with open(DATA_DIR + 'ZillowSFExcel.csv', 'rU') as csvfile:
zillowreader = csv.reader(csvfile, delimiter=',')
dictsetter = []
zillowreader.next()
for row in zillowreader:
dictsetter.append(row)
housingdict = {}
for i in range(1, len(dictsetter)):
housingdict[dictsetter[i][0]] = float(dictsetter[i][1])
with open(DATA_DIR + 'sfpd_incidents_2013_with_neighborhoods.csv', 'rU') as csvfile:
neighborhoodreader = csv.reader(csvfile, delimiter=',')
neighborhoodset = Set([])
for row in neighborhoodreader:
neighborhoodset.add(row[9])
neighborhoodset.remove('')
neighborhoodlist = list(neighborhoodset)
In [9]:
from pylab import *
boxplot(housingdict.values())
show()
In [19]:
# this cell generates some json for the map page
# the json is in the format:
# [{neighborhood_name:
# 'population': population_number,
# 'home_value_index': Zillow home value index,
# 'incident_counts': {
# 'bribery': 3,
# 'larceny': 5,...
# }
# },
# {...}
# ]
from sqlalchemy import func, desc
import json
json_obj = {}
session = Session()
query = session.query(Incident.neighborhood,
Incident.category,
func.count(Incident.category).label('count'))\
.filter(Incident.neighborhood != "")\
.group_by(Incident.neighborhood, Incident.category)\
.order_by(Incident.neighborhood, Incident.category, desc('count'))
for neighborhood, category, num_incidents in query:
if neighborhood not in json_obj:
home_value_index = incidents[neighborhood.lower()][0]
if home_value_index == -1:
home_value_index = None
json_obj[neighborhood] = {'population': cities[neighborhood.lower()],
'home_value_index': home_value_index,
'incident_counts': dict()}
json_obj[neighborhood]['incident_counts'][category] = num_incidents
json.dumps(json_obj)
with open(os.path.join(DATA_DIR, 'sf-incidents.json'), 'w') as f:
json.dump(json_obj, f, indent=4, sort_keys=True)
print "Dumped JSON to file"
In [ ]: