In [1]:
import pandas as pd
import re
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [16]:
df = pd.read_csv('./raw.csv')
df = df.drop('Unnamed: 0', axis=1)

In [17]:
def remove_extra_spaces(string):
    string = re.sub( '-', ' ', string ).strip()
    string = re.sub( '\s+', ' ', string ).strip()
    return string

df['Incident'] = df['Incident'].map(remove_extra_spaces)
df['Address'] = df['Address'].map(remove_extra_spaces)

In [18]:
def add_three(cat, desc=None):
    cats[cat] = [ cat, 0]
    cats[cat + ' 1ST DEGREE'] = [cat, 1]
    cats[cat + ' 2ND DEGREE'] = [cat, 2]
    cats[cat + ' 3RD DEGREE'] = [cat, 3]
    cats[cat + ' 1ST'] = [cat, 1]
    cats[cat + ' 2ND'] = [cat, 2]
    cats[cat + ' 3RD'] = [cat, 3]
    cats[cat + ' 1'] = [cat, 1]
    cats[cat + ' 2'] = [cat, 2]
    cats[cat + ' 3'] = [cat, 3]
    
cats = {
 'DRIVING UNDER INFLUENCE ALCOHOL': 
     ['DRIVING UNDER INFLUENCE', None],
 'DRIVING UNDER INFLUENCE DRUGS': 
     ['DRIVING UNDER INFLUENCE', None],
 'DRIVING UNDER THE INFLUENCE (ALCOHOL)': 
     ['DRIVING UNDER INFLUENCE', None],
 'DV 3RD HARASSMENT FAMILY': 
     ['DOMESTIC VIOLENCE', 3],
 'DV 3RD SIMPLE ASSAULT FAMILY': 
     ['DOMESTIC VIOLENCE', 3],
 'DV 3RD CRIMINAL MISCHIEF 3RD': 
     ['DOMESTIC VIOLENCE', 3],
 'DV 3RD HARASSING COMMUNICATIONS': 
     ['DOMESTIC VIOLENCE', 3],
 'HARASSMENT': 
     ['HARASSMENT', None],
 'HARASSING COMMUNICATIONS': 
     ['HARASSMENT', None],
 'ILLEGAL POSS/USE CREDIT CARD': 
     ['ILLEGAL POSS/USE CREDIT CARD', None],
 'UNLAWFUL BREAKING AND ENTERING A VEHCILE': 
     ['BREAKING AND ENTERING A VEHICLE', None],
 'UNLAWFUL BREAKING AND ENTERING A VEHICLE': 
     ['BREAKING AND ENTERING A VEHICLE', None],
 'BREAKING AND ENTERING A VEHICLE': 
     ['BREAKING AND ENTERING A VEHICLE', None],
 'UNLAWFUL BREAKING AND ENTERING A VEHICLE (NO THEFT)': 
     ['BREAKING AND ENTERING A VEHICLE', None],
 'UNAUTHORIZED USE OF AUTO': 
     ['UNAUTHORIZED USE', None],
 'UNAUTHORIZED USE OTHER VEHICLE': 
     ['UNAUTHORIZED USE', None],
 'DOMESTIC VIOLENCE BY STRANGULATION OR SUFFOCATION INJURY': 
     ['DOMESTIC VIOLENCE', None],
 'DOMESTIC VIOLENCE BY STRANGULATION OR SUFFOCATION': 
     ['DOMESTIC VIOLENCE', None],
 'DRIVING UNDER THE INFLUENCE (UNDER 21 YEARS)':
     ['DRIVING UNDER INFLUENCE', None],
 'DRIVING UNDER THE INFLUENCE (ALC. & CON. SUBS.)':
     ['DRIVING UNDER INFLUENCE', None],
 'CRUELTY TO ANIMALS':
     ['ANIMAL', None],
 'HARBORING VICIOUS OR DANGEROUS ANIMAL':
     ['ANIMAL', None],
 'ASSAULT':
     ['ASSAULT', None],
 'SIMPLE ASSAULT':
     ['ASSAULT', None],
 'CARRYING CONCEALED WEAPON':
     ['WEAPON', None],
 'FIREARMS LICENSE REQUIRED':
     ['WEAPON', None],
 'LICENSE TO CARRY PISTOL':
     ['WEAPON', None],
 'ALTER FIREARM ID NUMBER':
     ['WEAPON', None],
 'BRASS KNUCKLES':
     ['WEAPON', None],
 
 }

threes = [
  'ARSON',
  'AUTO THEFT', 
  'BREAKING AND ENTERING VEHICLE',
  'BURGLARY', 
  'CRIMINAL MISCHIEF',
  'CRIMINAL TAMPERING', 
  'CRIMINAL TRESPASS',
  'DOMESTIC VIOLENCE',
  'ELDER ABUSE AND NEGLECT',
  'ESCAPE',
  'EXTORTION',
  'FORGERY',
  'FINANCIAL EXPLOITATION OF ELDERLY PERSON',
  'IDENTITY THEFT',
  'RECEIVING STOLEN PROPERTY',
  'ROBBERY',
  'THEFT',
  'THEFT OF OTHER VEHICLES',
  'THEFT OF LOST PROPERTY',
  'THEFT OF SERVICES',
  'POSS. FORGED INSTRUMENT',
  'PROMOTING CONTRABAND',
  'STALKING',
  'HINDERING PROSECUTION',
  'PROMOTING PROSTITUTION',
  'PERJURY',
  'THEFT & SALE',
]

for three in threes:
    add_three(three)

others_deg = { 
  'AGGRAVATED ASSAULT': 
      ['ASSAULT', 'AGGRAVATED, '],
  'ROBBERY': 
      ['ROBBERY', ''],
  'DV': 
      ['DOMESTIC VIOLENCE', ''],
  'EXTORTION': 
      ['EXTORTION', ''],
  'ATTEMPTED BURGLARY': 
      ['ATTEMPTED BUGLARY', ''],
}

others = [
  'DISCHARGING FIREARM',
  'DUTY UPON STRIKING',
  'MURDER',
  'ADULT SEX OFFENDER',
  'ANIMAL',
  'ATTEMPTED MURDER',
  ]
    
def fix(string):
    if cats.has_key(string):
        return cats[string]
        
    for (k,v) in others_deg.iteritems():
        for (i, s) in enumerate(['1ST', '2ND', '3RD']):
            if string.find(k + ' ' + s) == 0:
                return [v[0], i]
    
    for k in others:
        if string.find(k) == 0:
            return [k, None]
    return ['OTHER', None]

In [19]:
new_data = pd.DataFrame(list(df['Incident'].map(fix)), columns=['cat', 'deg'])
new_data = df.join(new_data)
new_data.groupby('cat').count()

#d = new_data[pd.isnull(new_data['cat'])]
#grouped = d.groupby('Incident')
#c = grouped.count()
#v = c.sort('Address', ascending=False)['Incident']
#print v.to_string()


Out[19]:
Address Case DateTime File Incident NumInc Shift cat deg
cat
ADULT SEX OFFENDER 7 7 7 7 7 7 7 7 0
ANIMAL 67 67 67 67 67 67 67 67 0
ARSON 5 5 5 5 5 5 5 5 5
ASSAULT 270 270 270 270 270 270 270 270 86
ATTEMPTED BUGLARY 13 13 13 13 13 13 13 13 13
ATTEMPTED MURDER 7 7 7 7 7 7 7 7 0
AUTO THEFT 161 161 161 161 161 161 161 161 161
BREAKING AND ENTERING A VEHICLE 1502 1502 1502 1502 1502 1502 1502 1502 0
BREAKING AND ENTERING VEHICLE 2 2 2 2 2 2 2 2 2
BURGLARY 1125 1125 1125 1125 1125 1125 1125 1125 1125
CRIMINAL MISCHIEF 1158 1158 1158 1158 1158 1158 1158 1158 1158
CRIMINAL TAMPERING 40 40 40 40 40 40 40 40 40
CRIMINAL TRESPASS 173 173 173 173 173 173 173 173 173
DISCHARGING FIREARM 28 28 28 28 28 28 28 28 0
DOMESTIC VIOLENCE 1864 1864 1864 1864 1864 1864 1864 1864 1813
DRIVING UNDER INFLUENCE 28 28 28 28 28 28 28 28 0
DUTY UPON STRIKING 197 197 197 197 197 197 197 197 0
ELDER ABUSE AND NEGLECT 3 3 3 3 3 3 3 3 3
ESCAPE 10 10 10 10 10 10 10 10 10
EXTORTION 2 2 2 2 2 2 2 2 2
FINANCIAL EXPLOITATION OF ELDERLY PERSON 4 4 4 4 4 4 4 4 4
FORGERY 94 94 94 94 94 94 94 94 94
HARASSMENT 1296 1296 1296 1296 1296 1296 1296 1296 0
HINDERING PROSECUTION 7 7 7 7 7 7 7 7 7
IDENTITY THEFT 445 445 445 445 445 445 445 445 445
ILLEGAL POSS/USE CREDIT CARD 224 224 224 224 224 224 224 224 0
MURDER 5 5 5 5 5 5 5 5 0
OTHER 1780 1780 1780 1780 1780 1780 1780 1780 0
PERJURY 1 1 1 1 1 1 1 1 1
POSS. FORGED INSTRUMENT 155 155 155 155 155 155 155 155 155
PROMOTING CONTRABAND 12 12 12 12 12 12 12 12 12
PROMOTING PROSTITUTION 3 3 3 3 3 3 3 3 3
RECEIVING STOLEN PROPERTY 97 97 97 97 97 97 97 97 97
ROBBERY 144 144 144 144 144 144 144 144 144
STALKING 12 12 12 12 12 12 12 12 12
THEFT 4637 4637 4637 4637 4637 4637 4637 4637 4637
THEFT & SALE 1 1 1 1 1 1 1 1 1
THEFT OF LOST PROPERTY 139 139 139 139 139 139 139 139 139
THEFT OF OTHER VEHICLES 1 1 1 1 1 1 1 1 1
THEFT OF SERVICES 113 113 113 113 113 113 113 113 113
UNAUTHORIZED USE 118 118 118 118 118 118 118 118 0
WEAPON 87 87 87 87 87 87 87 87 0

42 rows × 9 columns


In [20]:
def fix(string):
    string = re.sub( 'Madison$', '', string ).strip()
    string = re.sub( 'Av$', 'Avenue', string ).strip()
    string = re.sub( 'Blvd$', 'Boulevard', string).strip()
    string = re.sub( 'Ln$', 'Lane', string ).strip()
    string = re.sub( 'Dr$', 'Drive', string ).strip()
    string = re.sub( 'Ct$', 'Court', string ).strip()
    string = re.sub( 'Cir$', 'Circle', string ).strip()
    string = re.sub( 'Rd$', 'Road', string ).strip()
    string = re.sub( 'Hwy$', 'Highway', string ).strip()
    string = re.sub( '72 W$', '72 West', string ).strip()
    string = re.sub( 'St$', 'Street', string).strip()
    string = re.sub( 'Tr$', 'Trail', string).strip()
    string = re.sub( 'Pl$', 'Place', string).strip()
    return string

new_data['Address'] = pd.DataFrame(list(new_data['Address'].map(fix)))

In [21]:
new_data.to_csv('clean.csv')

In [14]:
from geopy.geocoders import GoogleV3
import time
geolocator = GoogleV3(api_key="AIzaSyCtnKL4GRaUynWx2kuXaM0eVGpRObv-ZLs")

def geocode(string):
    print string
    location = geolocator.geocode(string + " Madison, AL")
    time.sleep(0.2)
    if location is None:
        return (None, None, None, None)
    if type(location) is tuple:
        location = location[0]
    return (location.address, location.latitude, location.longitude, location.raw)

In [64]:
addresses = new_data.groupby('Address')
for (k,v) in addresses.groups.iteritems():
    if not gc_results.has_key(k):
        gc_results[k] = geocode(k)


1000 Block of Nolan Court
Media Solutions, 9632 Madison Boulevard
The Heritage Station Homeowners Assoc, 116 Bayberry Lane
1000 Block of Waters Edge Lane
Board Of Education, 211 Celtic Drive
area of Chestnut Dr / Wall Triana Highway
Lowes Home Improvement Warehouse, 7920 Highway 72 West
area of Stella Drive
100 Block of Dupont Circle
City Of Madison, 5100 Wall Triana Highway
Halsey Food Service, 401 Lanier Road
Hogan Family Ymca, 130 Park Square Lane
City Of Madison, Hughes Rd / Will Halsey Way
Madison Police Department, Shelton Rd / Summerview Drive
600 Block of Larry Drive
area of Slaughter Road
600 Block of Browns Ferry Road
100 Block of Carlisle Circle
100 Block of Cambridge Circle
100 Block of Durham Drive
State Of Alabama, Madison Blvd / Wall Triana Highway
State Of Alabama, 8716 Madison Boulevard
Cb&S Bank, 8018 Highway 72 West
200 Block of Chantilly Lane
State Of Alabama, Bridgefield Rd / Wall Triana Highway
Madison City Board Of Education, 11306 County Line Road
American Overhead Door, 9091 Madison Boulevard
100 Block of Liza Lane
City Of Madison, Wall Triana Hwy @ I 565
St John Baptist Catholic Church, 1055 Hughes Road
area of Madison Blvd Madison / Wall Triana Highway
200 Block of Carriage Hill
100 Block of Lanthorn Circle
300 Block of Martin Street
Reeds Cable, 8000 Old Madison Pike
100 Block of Carlton Woods Drive
100 Block of Emerson Road
100 Block of Foxfield Drive
200 Block of Kyser Boulevard
City Of Madison, 151 Stone Meadow Lane
City Of Madison, Pebble Brook Cir / Wall Triana Highway
Buds Convenience Store, 101 Kyser Boulevard
City Of Madison, Gillespie Rd / Wall Triana Highway
100 Block of Autumn Haven Lane
200 Block of Shelton Road
900 Block of Portal Lane
State Of Alabama, 1550 Hughes Road
area of Rocket Harley Davidson
area of Green Creek Road
100 Block of Reinhart Circle
City Of Madison, 137 Manningham Drive
300 Block of Autumn Lane
City Of Huntsville, 138 Teal Park Lane
200 Block of Waterson Way
200 Block of Park Stone Drive
City Of Madison, 8781 Madison Boulevard
The Station Pub And Grill, 8694 Madison Boulevard
800 Block of Ashwood Drive
K`S Beauty Supply, 7837 Highway 72 West
Cd`S Pub And Grill, 107 Arlington Drive
100 Block of Liverpool Drive
area of Angela Drive
City Of Madison, 118 Ashville Wood Court
Hunters Chase Apartments, 340 Gillespie Road
City Of Madison, Bowers Rd / Powell Road
Madison City Schools, 11306 County Line Road
200 Block of Moore Street
200 Block of Front Street
City Of Madison, 145 Angela Drive
Neurovizion, 3490 Wall Triana Highway
100 Block of Vinewood Lane
area of Bradford Farms Subdivision
City Of Madison, 100 Park Square Lane
100 Block of Conger Road
State Of Alabama, 244 Kyser Boulevard
100 Block of Chadrick Drive
100 Block of Kensington Drive
Rbc Centura Bank, 7930 Highway 72 West
---------------------------------------------------------------------------
GeocoderQuotaExceeded                     Traceback (most recent call last)
<ipython-input-64-29d5ce5bd3db> in <module>()
      2 for (k,v) in addresses.groups.iteritems():
      3     if not gc_results.has_key(k):
----> 4         gc_results[k] = geocode(k)

<ipython-input-28-fe0dbede2a25> in geocode(string)
      5 def geocode(string):
      6     print string
----> 7     location = geolocator.geocode(string + " Madison, AL")
      8     time.sleep(0.2)
      9     if location is None:

/usr/local/lib/python2.7/dist-packages/geopy/geocoders/googlev3.pyc in geocode(self, query, exactly_one, timeout, bounds, region, components, language, sensor)
    214         logger.debug("%s.geocode: %s", self.__class__.__name__, url)
    215         return self._parse_json(
--> 216             self._call_geocoder(url, timeout=timeout), exactly_one
    217         )
    218 

/usr/local/lib/python2.7/dist-packages/geopy/geocoders/googlev3.pyc in _parse_json(self, page, exactly_one)
    335         places = page.get('results', [])
    336         if not len(places):
--> 337             self._check_status(page.get('status'))
    338             return None
    339 

/usr/local/lib/python2.7/dist-packages/geopy/geocoders/googlev3.pyc in _check_status(status)
    360         if status == 'OVER_QUERY_LIMIT':
    361             raise GeocoderQuotaExceeded(
--> 362                 'The given key has gone over the requests limit in the 24'
    363                 ' hour period or has submitted too many requests in too'
    364                 ' short a period of time.'

GeocoderQuotaExceeded: The given key has gone over the requests limit in the 24 hour period or has submitted too many requests in too short a period of time.


In [22]:
geocode_df = pd.DataFrame(gc_results).transpose()
geocode_df.columns = ['address', 'lat', 'lon', 'raw']
geocode_df.to_csv('geocode.csv')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-22-93778e16b17c> in <module>()
----> 1 geocode_df = pd.DataFrame(gc_results).transpose()
      2 geocode_df.columns = ['address', 'lat', 'lon', 'raw']
      3 geocode_df.to_csv('geocode.csv')

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    199                                  dtype=dtype, copy=copy)
    200         elif isinstance(data, dict):
--> 201             mgr = self._init_dict(data, index, columns, dtype=dtype)
    202         elif isinstance(data, ma.MaskedArray):
    203             import numpy.ma.mrecords as mrecords

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    321 
    322         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 323                               dtype=dtype)
    324 
    325     def _init_ndarray(self, values, index, columns, dtype=None,

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   4461     # figure out the index, if necessary
   4462     if index is None:
-> 4463         index = extract_index(arrays)
   4464     else:
   4465         index = _ensure_index(index)

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in extract_index(data)
   4512 
   4513             if have_dicts:
-> 4514                 raise ValueError('Mixing dicts with non-Series may lead to '
   4515                                  'ambiguous ordering.')
   4516 

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

In [112]:
geocode_df = pd.DataFrame.from_csv('geocode.csv')
geocode_df.ix[ geocode_df.lat < 34.484434, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lat > 34.8950, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lon > -86.4159, ['lat', 'lon']] = nan
geocode_df.ix[ geocode_df.lon < -86.984023, ['lat', 'lon']] = nan
geocode_dict = geocode_df.to_dict('dict')

In [114]:
new_data['Lat'] = new_data['Address'].apply(geocode_dict['lat'].get)
new_data['Lon'] = new_data['Address'].apply(geocode_dict['lon'].get)
new_data['LatLon'] = new_data['Address'].apply(geocode_dict['lat'].get).map(str) + ", " + new_data['Address'].apply(geocode_dict['lon'].get).map(str)
new_data.to_csv('new_data_gc.csv')

In [ ]: