In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('/home/keer/DSSG/data-challenges/BuildingInspections/data/Building_Violations_sample_50000.csv')

import re

def spaces_to_snake(column_name):
    """
    converts a string that has spaces into snake_case
    Example:
        print camel_to_snake("KENNY BROUGHT HIS WIFE")
        > KENNY_BROUGHT_HIS_WIFE
    To see how to apply this to camel case, see:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    s = re.sub(r"\s+", '_', column_name)
    return s.lower()

df.columns = [spaces_to_snake(col) for col in df.columns]


df.columns


Out[3]:
Index([u'unnamed:_0', u'id', u'violation_last_modified_date', u'violation_date', u'violation_code', u'violation_status', u'violation_status_date', u'violation_description', u'violation_location', u'violation_inspector_comments', u'violation_ordinance', u'inspector_id', u'inspection_number', u'inspection_status', u'inspection_waived', u'inspection_category', u'department_bureau', u'address', u'property_group', u'ssa', u'latitude', u'longitude', u'location'], dtype='object')

In [8]:
inspection_frequency=df[['location','id']].groupby('location').count()

In [14]:
violation_frequency=df[['location','id']][df['inspection_status']=='FAILED'].groupby(['location']).count()

In [83]:
dummy=df[['location','violation_code','id']][df['inspection_status']=='FAILED'].groupby(['location','violation_code']).count()
dummy['id']=1
violation_type_count=dummy.reset_index()[['location','id']].groupby('location').count().reset_index()
violation_type_count.columns=['locations','unique_violation_codes']
violation_type_count.head(25)


Out[83]:
locations unique_violation_codes
0 (41.64692911211202,-87.61447966111751) 1
1 (41.647795874157346,-87.54018148507018) 2
2 (41.64781346219875,-87.61421661741555) 1
3 (41.6478148942479,-87.61645925569275) 1
4 (41.648032636313566,-87.61392876189757) 2
5 (41.648033312120475,-87.61449814827196) 1
6 (41.64803352865298,-87.61558610924808) 1
7 (41.64831133789253,-87.54450422674195) 1
8 (41.64868870411254,-87.54488161972836) 1
9 (41.64892413385787,-87.54628496618952) 1
10 (41.649007538483055,-87.55916462709274) 1
11 (41.64919842993987,-87.54578457882039) 1
12 (41.649496000274205,-87.5385111493897) 1
13 (41.64954707623299,-87.547037252503) 1
14 (41.64968175814829,-87.54578298947494) 1
15 (41.64988698229695,-87.54607483071594) 1
16 (41.64993012181813,-87.54701738845729) 2
17 (41.650343384158205,-87.54607282205943) 1
18 (41.65054094697297,-87.5482607057881) 1
19 (41.65062179173726,-87.54095925939012) 1
20 (41.6507152932624,-87.54824755540024) 1
21 (41.65074076438346,-87.54607322537252) 1
22 (41.65079720427416,-87.54578041438954) 1
23 (41.65091265199324,-87.54701434883746) 1
24 (41.65096740821523,-87.54578010931255) 1

In [103]:
%matplotlib inline

violation_type_dist=violation_type_count.copy()
violation_type_dist['locations']=1
violation_type_dist=violation_type_dist.groupby('unique_violation_codes').count().reset_index()
violation_type_dist.columns=['unique_violation_codes','number_of_buildings']
violation_type_dist.head()


Out[103]:
unique_violation_codes number_of_buildings
0 1 20145
1 2 4921
2 3 1347
3 4 482
4 5 181

In [ ]:


In [ ]: