In [1]:
import pandas as pd
import re
%pylab inline
In [16]:
df = pd.read_csv('./raw.csv')
df = df.drop('Unnamed: 0', axis=1)
In [17]:
def remove_extra_spaces(string):
string = re.sub( '-', ' ', string ).strip()
string = re.sub( '\s+', ' ', string ).strip()
return string
df['Incident'] = df['Incident'].map(remove_extra_spaces)
df['Address'] = df['Address'].map(remove_extra_spaces)
In [18]:
def add_three(cat, desc=None):
cats[cat] = [ cat, 0]
cats[cat + ' 1ST DEGREE'] = [cat, 1]
cats[cat + ' 2ND DEGREE'] = [cat, 2]
cats[cat + ' 3RD DEGREE'] = [cat, 3]
cats[cat + ' 1ST'] = [cat, 1]
cats[cat + ' 2ND'] = [cat, 2]
cats[cat + ' 3RD'] = [cat, 3]
cats[cat + ' 1'] = [cat, 1]
cats[cat + ' 2'] = [cat, 2]
cats[cat + ' 3'] = [cat, 3]
cats = {
'DRIVING UNDER INFLUENCE ALCOHOL':
['DRIVING UNDER INFLUENCE', None],
'DRIVING UNDER INFLUENCE DRUGS':
['DRIVING UNDER INFLUENCE', None],
'DRIVING UNDER THE INFLUENCE (ALCOHOL)':
['DRIVING UNDER INFLUENCE', None],
'DV 3RD HARASSMENT FAMILY':
['DOMESTIC VIOLENCE', 3],
'DV 3RD SIMPLE ASSAULT FAMILY':
['DOMESTIC VIOLENCE', 3],
'DV 3RD CRIMINAL MISCHIEF 3RD':
['DOMESTIC VIOLENCE', 3],
'DV 3RD HARASSING COMMUNICATIONS':
['DOMESTIC VIOLENCE', 3],
'HARASSMENT':
['HARASSMENT', None],
'HARASSING COMMUNICATIONS':
['HARASSMENT', None],
'ILLEGAL POSS/USE CREDIT CARD':
['ILLEGAL POSS/USE CREDIT CARD', None],
'UNLAWFUL BREAKING AND ENTERING A VEHCILE':
['BREAKING AND ENTERING A VEHICLE', None],
'UNLAWFUL BREAKING AND ENTERING A VEHICLE':
['BREAKING AND ENTERING A VEHICLE', None],
'BREAKING AND ENTERING A VEHICLE':
['BREAKING AND ENTERING A VEHICLE', None],
'UNLAWFUL BREAKING AND ENTERING A VEHICLE (NO THEFT)':
['BREAKING AND ENTERING A VEHICLE', None],
'UNAUTHORIZED USE OF AUTO':
['UNAUTHORIZED USE', None],
'UNAUTHORIZED USE OTHER VEHICLE':
['UNAUTHORIZED USE', None],
'DOMESTIC VIOLENCE BY STRANGULATION OR SUFFOCATION INJURY':
['DOMESTIC VIOLENCE', None],
'DOMESTIC VIOLENCE BY STRANGULATION OR SUFFOCATION':
['DOMESTIC VIOLENCE', None],
'DRIVING UNDER THE INFLUENCE (UNDER 21 YEARS)':
['DRIVING UNDER INFLUENCE', None],
'DRIVING UNDER THE INFLUENCE (ALC. & CON. SUBS.)':
['DRIVING UNDER INFLUENCE', None],
'CRUELTY TO ANIMALS':
['ANIMAL', None],
'HARBORING VICIOUS OR DANGEROUS ANIMAL':
['ANIMAL', None],
'ASSAULT':
['ASSAULT', None],
'SIMPLE ASSAULT':
['ASSAULT', None],
'CARRYING CONCEALED WEAPON':
['WEAPON', None],
'FIREARMS LICENSE REQUIRED':
['WEAPON', None],
'LICENSE TO CARRY PISTOL':
['WEAPON', None],
'ALTER FIREARM ID NUMBER':
['WEAPON', None],
'BRASS KNUCKLES':
['WEAPON', None],
}
threes = [
'ARSON',
'AUTO THEFT',
'BREAKING AND ENTERING VEHICLE',
'BURGLARY',
'CRIMINAL MISCHIEF',
'CRIMINAL TAMPERING',
'CRIMINAL TRESPASS',
'DOMESTIC VIOLENCE',
'ELDER ABUSE AND NEGLECT',
'ESCAPE',
'EXTORTION',
'FORGERY',
'FINANCIAL EXPLOITATION OF ELDERLY PERSON',
'IDENTITY THEFT',
'RECEIVING STOLEN PROPERTY',
'ROBBERY',
'THEFT',
'THEFT OF OTHER VEHICLES',
'THEFT OF LOST PROPERTY',
'THEFT OF SERVICES',
'POSS. FORGED INSTRUMENT',
'PROMOTING CONTRABAND',
'STALKING',
'HINDERING PROSECUTION',
'PROMOTING PROSTITUTION',
'PERJURY',
'THEFT & SALE',
]
for three in threes:
add_three(three)
others_deg = {
'AGGRAVATED ASSAULT':
['ASSAULT', 'AGGRAVATED, '],
'ROBBERY':
['ROBBERY', ''],
'DV':
['DOMESTIC VIOLENCE', ''],
'EXTORTION':
['EXTORTION', ''],
'ATTEMPTED BURGLARY':
['ATTEMPTED BUGLARY', ''],
}
others = [
'DISCHARGING FIREARM',
'DUTY UPON STRIKING',
'MURDER',
'ADULT SEX OFFENDER',
'ANIMAL',
'ATTEMPTED MURDER',
]
def fix(string):
if cats.has_key(string):
return cats[string]
for (k,v) in others_deg.iteritems():
for (i, s) in enumerate(['1ST', '2ND', '3RD']):
if string.find(k + ' ' + s) == 0:
return [v[0], i]
for k in others:
if string.find(k) == 0:
return [k, None]
return ['OTHER', None]
In [19]:
new_data = pd.DataFrame(list(df['Incident'].map(fix)), columns=['cat', 'deg'])
new_data = df.join(new_data)
new_data.groupby('cat').count()
#d = new_data[pd.isnull(new_data['cat'])]
#grouped = d.groupby('Incident')
#c = grouped.count()
#v = c.sort('Address', ascending=False)['Incident']
#print v.to_string()
Out[19]:
In [20]:
def fix(string):
string = re.sub( 'Madison$', '', string ).strip()
string = re.sub( 'Av$', 'Avenue', string ).strip()
string = re.sub( 'Blvd$', 'Boulevard', string).strip()
string = re.sub( 'Ln$', 'Lane', string ).strip()
string = re.sub( 'Dr$', 'Drive', string ).strip()
string = re.sub( 'Ct$', 'Court', string ).strip()
string = re.sub( 'Cir$', 'Circle', string ).strip()
string = re.sub( 'Rd$', 'Road', string ).strip()
string = re.sub( 'Hwy$', 'Highway', string ).strip()
string = re.sub( '72 W$', '72 West', string ).strip()
string = re.sub( 'St$', 'Street', string).strip()
string = re.sub( 'Tr$', 'Trail', string).strip()
string = re.sub( 'Pl$', 'Place', string).strip()
return string
new_data['Address'] = pd.DataFrame(list(new_data['Address'].map(fix)))
In [21]:
new_data.to_csv('clean.csv')
In [14]:
from geopy.geocoders import GoogleV3
import time
geolocator = GoogleV3(api_key="AIzaSyCtnKL4GRaUynWx2kuXaM0eVGpRObv-ZLs")
def geocode(string):
print string
location = geolocator.geocode(string + " Madison, AL")
time.sleep(0.2)
if location is None:
return (None, None, None, None)
if type(location) is tuple:
location = location[0]
return (location.address, location.latitude, location.longitude, location.raw)
In [64]:
addresses = new_data.groupby('Address')
for (k,v) in addresses.groups.iteritems():
if not gc_results.has_key(k):
gc_results[k] = geocode(k)
In [22]:
geocode_df = pd.DataFrame(gc_results).transpose()
geocode_df.columns = ['address', 'lat', 'lon', 'raw']
geocode_df.to_csv('geocode.csv')
In [112]:
geocode_df = pd.DataFrame.from_csv('geocode.csv')
geocode_df.ix[ geocode_df.lat < 34.484434, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lat > 34.8950, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lon > -86.4159, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lon < -86.984023, ['lat', 'lon']] = nan
geocode_dict = geocode_df.to_dict('dict')
In [114]:
new_data['Lat'] = new_data['Address'].apply(geocode_dict['lat'].get)
new_data['Lon'] = new_data['Address'].apply(geocode_dict['lon'].get)
new_data['LatLon'] = new_data['Address'].apply(geocode_dict['lat'].get).map(str) + ", " + new_data['Address'].apply(geocode_dict['lon'].get).map(str)
new_data.to_csv('new_data_gc.csv')
In [ ]: