In [3]:
import pandas as pd
import numpy as np
# Load the building-violations CSV into the DataFrame used by every later cell.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine where the file exists at exactly this location. Consider a
# configurable data directory. The file name suggests a 50,000-row sample --
# TODO confirm.
df = pd.read_csv('/home/keer/DSSG/data-challenges/BuildingInspections/data/Building_Violations_sample_50000.csv')
import re
def spaces_to_snake(column_name):
    """
    Convert a whitespace-separated string into lower-case snake_case.

    Leading and trailing whitespace is discarded, every remaining run of
    whitespace (spaces, tabs, newlines) is replaced by a single underscore,
    and the result is lower-cased.

    Example:
        >>> spaces_to_snake("KENNY BROUGHT HIS WIFE")
        'kenny_brought_his_wife'

    To see how to apply this to camel case, see:
    http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case

    Args:
        column_name: the string (e.g. a CSV column header) to convert.

    Returns:
        The converted string.
    """
    # Strip first so a padded header such as " ID " becomes "id", not "_id_".
    s = re.sub(r"\s+", '_', column_name.strip())
    return s.lower()
# Rewrite every column label with spaces_to_snake so the cells below can refer
# to columns by plain names such as 'location' and 'inspection_status'.
df.columns = list(map(spaces_to_snake, df.columns))
# Display the resulting labels as the cell output.
df.columns
Out[3]:
In [8]:
# Per-location count of non-null `id` values, i.e. how many rows each
# location has in the data. The result is indexed by `location`.
inspection_frequency = df.groupby('location')[['id']].count()
In [14]:
# Same per-location count as above, restricted to rows whose
# inspection_status is exactly 'FAILED'.
violation_frequency = (
    df.loc[df['inspection_status'] == 'FAILED', ['location', 'id']]
    .groupby(['location'])
    .count()
)
In [83]:
# For each location, count how many distinct violation codes appear among its
# FAILED rows. Output columns: 'locations', 'unique_violation_codes'.
violation_type_count = (
    df.loc[df['inspection_status'] == 'FAILED', ['location', 'violation_code']]
    .dropna()            # grouping skips rows with a missing key; keep that explicit
    .drop_duplicates()   # one row per (location, violation_code) pair
    .groupby('location')
    .size()              # remaining rows per location == distinct codes
    .reset_index(name='unique_violation_codes')
    .rename(columns={'location': 'locations'})
)
violation_type_count.head(25)
Out[83]:
In [103]:
%matplotlib inline
# Distribution of the per-location counts: for each value of
# 'unique_violation_codes', how many rows of violation_type_count (one row per
# location) have that value. The notebook labels these 'number_of_buildings'.
violation_type_dist = (
    violation_type_count
    .groupby('unique_violation_codes')
    .size()
    .reset_index(name='number_of_buildings')
)
violation_type_dist.head()
Out[103]:
In [ ]:
In [ ]: