In [13]:
import getpass
current_user = getpass.getuser()
if current_user == "kedo":
    DATA_DIR = '/home/kedo/Dropbox/CS 194-16/datascience-hanzai-keisu/src/data/'
elif current_user == "ylabur":
    DATA_DIR = '/home/ylabur/git/datascience-hanzai-keisu/src/data/'
elif current_user == "Allen":
    DATA_DIR = '/Users/Allen/Dropbox/CS194FinalProject/datascience-hanzai-keisu/src/data/'
else:
    DATA_DIR = '.'

In [4]:
import csv
import datetime
import os
from sqlalchemy import (
    create_engine,
    Column,
    Integer,
    String,
    Date,
    Time,
    func,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
# create a sqlite db in memory
# pass in echo=True to debug SQL
engine = create_engine('sqlite:////' + os.path.join(DATA_DIR, 'sqlite.db'))
Session = sessionmaker(bind=engine)

class Incident(Base):
    """
    An incident from the csv file.
    Note that the incident number isn't a primary key, so I the row_number (from the CSV) the primary_key
    """
    __tablename__ = 'incidents'
    
    row_number = Column(Integer, primary_key=True)
    number = Column(Integer)
    category = Column(String(32))
    description = Column(String(64))
    date = Column(Date)
    time = Column(Time)
    pd_district = Column(String(32))
    resolution = Column(String(32))
    neighborhood = Column(String(32))
    
    def __repr__(self):
        return "<Row {row}, Incident #{number} '{category}'>".format(
            row=self.row_number,
            number=self.number,
            category=self.category)

# re-create the schema in the db
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine, checkfirst=True)

def parse_sfpd_incidents_csv(filename):
    with open(filename, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        reader.next() # skip the header
        row_number = 0
        for row in reader:
            yield (row_number, row)
            row_number += 1

print "Initiating import of CSV into SQLite"
session = Session()
row_generator = parse_sfpd_incidents_csv(os.path.join(DATA_DIR, 'sfpd_incidents_2013_with_neighborhoods.csv'))
for (row_number, row) in row_generator:
    # sfpd_incidents_2013_with_neighborhoods.csv is in the following format:
    # Incident Number,Category,Description,Date,Time,PD District,Resolution,Longitude,Latitude,Neighborhood
    # note that there can be more than 1 row with the same incident number
    incident = Incident(row_number=row_number,
                        number=row[0],
                        category=row[1],
                        description=row[2],
                        date=datetime.datetime.strptime(row[3], '%m/%d/%Y').date(),
                        time=datetime.datetime.strptime(row[4], '%H:%M').time(),
                        pd_district=row[5],
                        resolution=row[6],
                        neighborhood=row[9])
    session.add(incident)
session.commit()
print "Finished importing CSV into SQLite"


Initiating import of CSV into SQLite
Finished importing CSV into SQLite

We see which how many of each category there is in the incident DB.


In [3]:
from sqlalchemy import func
from sqlalchemy import desc
session = Session()
query = session.query(Incident.category, func.count(Incident.category)\
                .label('count'))\
                .group_by(Incident.category)\
                .order_by(desc('count'))
incident_categories = list()
print "{:<30} {:<20}".format("Category", "Count")
for category, num_incidents in query:
    incident_categories.append(category)
    print "{:<30} {:<20}".format(category, num_incidents)


Category                       Count               
LARCENY/THEFT                  35602               
OTHER OFFENSES                 15826               
NON-CRIMINAL                   15290               
ASSAULT                        10356               
WARRANTS                       6995                
DRUG/NARCOTIC                  6678                
VANDALISM                      6649                
VEHICLE THEFT                  6202                
BURGLARY                       5840                
MISSING PERSON                 4374                
ROBBERY                        4155                
SUSPICIOUS OCC                 3335                
FRAUD                          2567                
WEAPON LAWS                    1166                
TRESPASS                       989                 
DRUNKENNESS                    954                 
RECOVERED VEHICLE              787                 
FORGERY/COUNTERFEITING         706                 
SEX OFFENSES, FORCIBLE         563                 
KIDNAPPING                     512                 
DISORDERLY CONDUCT             460                 
DRIVING UNDER THE INFLUENCE    429                 
PROSTITUTION                   269                 
ARSON                          228                 
LIQUOR LAWS                    227                 
RUNAWAY                        219                 
EMBEZZLEMENT                   136                 
SUICIDE                        73                  
LOITERING                      52                  
FAMILY OFFENSES                35                  
EXTORTION                      30                  
BAD CHECKS                     24                  
GAMBLING                       21                  
STOLEN PROPERTY                12                  
SEX OFFENSES, NON FORCIBLE     11                  
PORNOGRAPHY/OBSCENE MAT        5                   
BRIBERY                        3                   

In [5]:
import csv
import numpy
from numpy import arange,array,ones
from pylab import plot,show
from scipy import stats
import matplotlib.pyplot as plt

# cities and their populations
cities = {u'buena vista park': 2821,
          u'north waterfront': 1904,
          u'forest hill': 3322,
          u'inner richmond': 17074,
          u'van ness - civic center': 11686,
          u'lakeside': 1751,
          u'mission bay': 4789,
          u'midtown terrace': 3742,
          u'eureka valley - dolores heights - castro': 12064,
          u'miraloma park': 3648,
          u'western addition': 11981,
          u'lake': 8673,
          u'marina': 10246,
          u'ingleside terrace': 2353,
          u'inner sunset': 37388,
          u'castro-upper market': 2631,
          u'silver terrace': 6781,
          u'st. francis wood': 1318,
          u'sea cliff': 829,
          u'telegraph hill': 6766,
          u'mission': 2874,
          u'': 1,
          u'russian hill': 12967,
          u'parnassus - ashbury': 5340,
          u'twin peaks': 3859,
          u'portola': 12962,
          u'mount davidson manor': 1702,
          u'ocean view': 6271,
          u'hayes valley': 4889,
          u'central sunset': 20577,
          u'presidio heights': 4091,
          u'lower pacific heights': 10747,
          u'forest hill extension': 3422,
          u'golden gate heights': 2469,
          u'westwood highlands': 1694,
          u'oceanview': 6271,
          u'ingleside heights': 6702,
          u'noe valley': 22009,
          u'outer parkside': 13857,
          u'south beach': 5648,
          u'central waterfront - dogpatch': 820,
          u'lakeshore': 1,
          u'diamond heights': 2153,
          u'haight-ashbury': 10523,
          u'nob hill': 17900,
          u'outer richmond': 16821,
          u'chinatown': 10775,
          u'yerba buena': 10,
          u'financial district': 9301,
          u'glen park': 6066,
          u'excelsior': 25999,
          u'downtown': 19178,
          u'forest knolls': 2889,
          u'west portal': 2763,
          u'westwood park': 1933,
          u'outer sunset': 74129,
          u'parkside': 19871,
          u'cow hollow': 7314,
          u'inner parkside': 5122,
          u'lone mountain': 7668,
          u'north panhandle': 10387,
          u'bernal heights': 24430,
          u'mission terrace': 1,
          u'west of twin peaks': 4961,
          u'south of market': 11130,
          u'jordan park - laurel heights': 4170,
          u'crocker amazon': 9790,
          u'potrero hill': 13725,
          u'ingleside': 9082,
          u'pacific heights': 10747,
          u'seacliff': 829,
          u'corona heights': 3488,
          u'outer mission': 9328,
          u'merced heights': 3852,
          u'visitacion valley': 23054,
          u'central richmond': 25617,
          u'sunnyside': 8279,
          u'bayview': 17716,
          u'north beach': 17675,
          u'little hollywood': 983}

#crimes is a dictionary mapping of crime categories to number of incidents
crimes = {}
# incidents is a dictioanry mapping of neighborhoods to average home index values and crimes
incidents = {}

# initialize crimes to have a dict entry for each category
for value in session.query(Incident.category).distinct():
    crimes[value[0]] = 0

reader = csv.reader(open('data/ZillowSFExcel.csv', 'rU'), delimiter=',', quotechar='"')
# assign the home index value and a copy of crimes to each neighborhood
for row in reader:
    try:
        incidents[unicode(row[0].lower())] = (int(row[1]), crimes.copy())
    except:
        pass

# for each crime report, tally up the corresponding crime category in the corresponding neighborhood
for instance in session.query(Incident):
    if instance.neighborhood.lower() not in incidents:
        temp = crimes.copy()
        temp[instance.category] += 1
        incidents[instance.neighborhood.lower()] = (-1, temp)
    else:
        temp = incidents[instance.neighborhood.lower()][1]
        temp[instance.category] += 1
        incidents[instance.neighborhood.lower()] = (incidents[instance.neighborhood.lower()][0], temp)
#print incidents
print "Generated incidents dictionary"


Generated incidents dictionary

In [5]:
# function that, given the list of crime categories, will pick out the crime rate for each neighborhood for that
# selection.
# threshold is used to filter out some of the outlier-ish crime rates
# returns a pair of arrays, home index values and crime rates
def get_plots(categories=None, threshold=0.2):
    if categories == None:
        categories = [cat for cat in crimes.keys()]
    indexes = []
    crime_rates = []
    for key, value in incidents.items():
        num_crimes = 0
        for cat in categories:
            num_crimes += value[1][cat]
        if key in cities:
            crime_rate = num_crimes / (cities[key] + 0.0)
        else:
            crime_rate = -1
        if crime_rate > threshold:
            crime_rate = -1
        if crime_rate > 0 and value[0] > 0:
            crime_rates.append(crime_rate)
            indexes.append(value[0])
    return indexes, crime_rates

# function that does a linear regression over the datapoints generated by the above function.
# returns the r_value, p_value, std_err, and line
def run_lin_reg(x=[],y=[]):
    xi = array(x)
    slope, intercept, r_value, p_value, std_err = stats.linregress(xi,y)
    line = slope*xi+intercept
    plot(xi,line,'r-',xi,y,'o')
    show()
    return r_value, p_value, std_err, line

In [6]:
x, y = get_plots([u'LARCENY/THEFT', u'ROBBERY', u'BURGLARY', u'STOLEN PROPERTY'])
r_value, p_value, std_err, line = run_lin_reg(x, y)

We run a linear regression on the "crime rate" of incidents of each category in each neighborhood, and then sort it to find the weights of each category.


In [7]:
# fit a linear regression model
from sklearn import linear_model
regression = linear_model.LinearRegression()
# here, X is a list of rows of neighborhoods
# where each row is the number of incidents for a certain category in that neighborhood
# and the order of the rows is defined by incident_categories
X = []
y = []
for neighborhood, data in incidents.iteritems():
    zillow_price, incidents_by_category = data
    population = cities[neighborhood]
    x = []
    for category in incident_categories:
        if category in incidents_by_category:
            x.append(incidents_by_category[category] / float(population))
        else:
            print neighborhood, " didn't have anything for category ", category
            x.append(0)
    X.append(x)
    y.append(zillow_price)
regression.fit(X, y)
print regression.score(X, y)
category_weights = zip(incident_categories, regression.coef_)
category_weights.sort(key=lambda x: x[1])
print "{:<30} {:<20}".format("Category", "Regression Coefficient")
for c, w in category_weights:
    print "{:<30} {:<20}".format(c, w)


0.677152393462
Category                       Regression Coefficient
GAMBLING                       -5339176575.58      
SUICIDE                        -3825622420.49      
STOLEN PROPERTY                -2976248465.04      
BRIBERY                        -2522638898.12      
KIDNAPPING                     -1835831649.2       
DRIVING UNDER THE INFLUENCE    -1443479981.08      
ARSON                          -927179614.149      
BAD CHECKS                     -395574948.676      
DRUNKENNESS                    -380565676.168      
RUNAWAY                        -346192496.549      
TRESPASS                       -283579932.279      
PROSTITUTION                   -247577290.229      
DRUG/NARCOTIC                  -231372798.142      
VANDALISM                      -228841307.802      
FRAUD                          -220288135.844      
ROBBERY                        -79460214.2987      
MISSING PERSON                 -65986084.4835      
NON-CRIMINAL                   -30598396.9938      
OTHER OFFENSES                 -3517395.27306      
RECOVERED VEHICLE              -2561253.35047      
LARCENY/THEFT                  1421310.17127       
BURGLARY                       1886535.89928       
ASSAULT                        2133600.1704        
VEHICLE THEFT                  139009956.714       
WEAPON LAWS                    165331104.899       
WARRANTS                       313079471.602       
SUSPICIOUS OCC                 403264989.298       
SEX OFFENSES, FORCIBLE         464646007.968       
DISORDERLY CONDUCT             727752949.94        
FORGERY/COUNTERFEITING         1436530022.35       
PORNOGRAPHY/OBSCENE MAT        1669695359.04       
EMBEZZLEMENT                   1690590014.9        
LIQUOR LAWS                    3025841257.94       
FAMILY OFFENSES                3170422801.23       
LOITERING                      3203002388.39       
EXTORTION                      4257995814.63       
SEX OFFENSES, NON FORCIBLE     6306415962.45       

In [8]:
import csv
from sets import Set

with open(DATA_DIR + 'ZillowSFExcel.csv', 'rU') as csvfile:
    zillowreader = csv.reader(csvfile, delimiter=',')
    
    dictsetter = []
    zillowreader.next()
    for row in zillowreader:
        dictsetter.append(row)

housingdict = {}        
for i in range(1, len(dictsetter)):
    housingdict[dictsetter[i][0]] = float(dictsetter[i][1])


with open(DATA_DIR  + 'sfpd_incidents_2013_with_neighborhoods.csv', 'rU') as csvfile:
    neighborhoodreader = csv.reader(csvfile, delimiter=',')
    
    neighborhoodset = Set([])
    for row in neighborhoodreader:
        neighborhoodset.add(row[9])
        
neighborhoodset.remove('')        
neighborhoodlist = list(neighborhoodset)


-c:2: DeprecationWarning: the sets module is deprecated

In [9]:
from pylab import *



boxplot(housingdict.values())
show()

In [19]:
# this cell generates some json for the map page
# the json is in the format:
# [{neighborhood_name:
#     'population': population_number,
#     'home_value_index': Zillow home value index,
#     'incident_counts': {
#          'bribery': 3,
#          'larceny': 5,...
#      }
#  },
#  {...}
# ]
from sqlalchemy import func, desc
import json

json_obj = {}
session = Session()
query = session.query(Incident.neighborhood,
                      Incident.category,
                      func.count(Incident.category).label('count'))\
                .filter(Incident.neighborhood != "")\
                .group_by(Incident.neighborhood, Incident.category)\
                .order_by(Incident.neighborhood, Incident.category, desc('count'))
for neighborhood, category, num_incidents in query:
    if neighborhood not in json_obj:
        home_value_index = incidents[neighborhood.lower()][0]
        if home_value_index == -1:
            home_value_index = None
        json_obj[neighborhood] = {'population': cities[neighborhood.lower()],
                                  'home_value_index': home_value_index,
                                  'incident_counts': dict()}
    json_obj[neighborhood]['incident_counts'][category] = num_incidents
json.dumps(json_obj)

with open(os.path.join(DATA_DIR, 'sf-incidents.json'), 'w') as f:
    json.dump(json_obj, f, indent=4, sort_keys=True)
print "Dumped JSON to file"


Dumped JSON to file

In [ ]: