Data Analysis with Python: Pandas Data Aggregation and Group Operations, by Wes McKinney
Based on an assignment by JT Wolohan
Hate Crimes Dataset from U.S. Department of Justice and FBI [Table 13]
State               Str   State which reported the data
Agency type         Str   Type of agency reporting the data
Religion            Int   Count of hate crimes motivated by religion
Sexual orientation  Int   Count of hate crimes motivated by sexual orientation
Ethnicity           Int   Count of hate crimes motivated by ethnicity
Disability          Int   Count of hate crimes motivated by disibility
Gender              Int   Count of hate crimes motivated by gender
Gender identity     Int   Count of hate crimes motivated by gender identity
1Q                  Int   Count of hate crimes reported in Q1
2Q                  Int   Count of hate crimes repotred in Q2
3Q                  Int   Count of hate crimes reported in Q3
4Q                  Int   Count of hate crimes reported in Q4
Population          Int   Population of area for which reporting agency is responsible
hate_crime_2013.csv
hate_crime_2013.json
hate_crime_2013.db
In [1]:
    
import pandas as pd
import sqlite3
    
In [2]:
    
df1 = pd.read_csv('hate_crimes_2013.csv')
df1['ID'] = df1.index
df1.head()
    
    Out[2]:
In [3]:
    
df2 = pd.read_json('hate_crimes_2013.json')
df2['ID'] = df2.index
df2.head()
    
    Out[3]:
In [4]:
    
df3 = pd.merge(df1, df2, how='left', left_on='ID', right_on='ID')
df3.head()
    
    Out[4]:
In [5]:
    
# Create sqlite connection
cnx = sqlite3.connect('hatecrimes.db')
df4 = pd.read_sql_query("SELECT * FROM crimes", cnx)
    
In [6]:
    
df4['ID'] = df4.index
df4.head()
    
    Out[6]:
In [7]:
    
crimesDF = pd.merge(df3, df4, how='left', left_on='ID', right_on='ID')
crimesDF.head()
    
    Out[7]:
In [8]:
    
columns = ['Agency_name_y', 'Agency_type_y', 'State_y', 'state', 'agType', 'agName']
crimesDF.drop(columns, inplace=True, axis=1)
crimesDF.rename(columns={'State_x': 'State', 'Agency_type_x': 'Agency_Type', 'Agency_name_x':'Agency_Name',
                         'vRace': 'Race', 'vRel': 'Religion', 'vSexOr': 'SexOrient',
                          }, inplace=True)
crimesDF.head()
    
    Out[8]:
In [9]:
    
crimesDF.to_csv("hate_crimes_13.csv", sep=',', encoding='utf-8')
    
In [10]:
    
columns = ['Agency_Type', 'quarter_1', 'quarter_2', 'quarter_3', 'quarter_4']
agencyDF = crimesDF[columns].copy()
agencyDF['annual'] = agencyDF.loc[:,('quarter_1', 'quarter_2', 'quarter_3', 'quarter_4')].sum(axis=1)
agencyDF.head()
    
    Out[10]:
In [11]:
    
# You can select data for only Cities subset for Indexing on Agency_type
agencyDF[agencyDF.Agency_Type == 'Cities'].head()
    
    Out[11]:
In [12]:
    
agencyDF.groupby('Agency_Type')
    
    Out[12]:
In [13]:
    
agencyDF[['quarter_1', 'quarter_2', 'quarter_3', 'quarter_4', 'annual']].groupby(agencyDF['Agency_Type']).sum()
    
    Out[13]:
In [14]:
    
output = []
for name, group in agencyDF.groupby('Agency_Type'):
    prctQ1 = (group['quarter_1'].sum() / group['annual'].sum())*100
    prctQ2 = (group['quarter_2'].sum() / group['annual'].sum())*100
    prctQ3 = (group['quarter_3'].sum() / group['annual'].sum())*100
    prctQ4 = (group['quarter_4'].sum() / group['annual'].sum())*100
    
    output.append({'Agency_Type': name, 'Q1':prctQ1, 'Q2':prctQ2, 'Q3':prctQ3, 'Q4':prctQ4})
DF = pd.DataFrame(output)
print("Percentage of Annual Crime by Annual Quarters for 2013")  
print(DF)
    
    
In [15]:
    
agencyDF.groupby('Agency_Type').sum()
    
    Out[15]:
In [16]:
    
columns = ['State', 'Ethnicity', 'Race']
ethnicDF = crimesDF[columns].copy()
ethnicDF['Ethnic'] = ethnicDF.loc[:,('Ethnicity', 'Race')].sum(axis=1)
ethnicDF.head()
    
    Out[16]:
In [17]:
    
# Group ethnicDF by State, creates GroupBy object
ethnicDF.groupby('State').sum()
    
    Out[17]:
In [18]:
    
columns = ['Disability', 'quarter_1', 'quarter_2', 'quarter_3', 'quarter_4']
disabledDF = crimesDF[columns].copy()
disabledDF['tot2013'] = disabledDF.loc[:,('quarter_1', 'quarter_2', 'quarter_3', 'quarter_4')].sum(axis=1)
disabledDF.head()
    
    Out[18]:
In [19]:
    
disabled = disabledDF['Disability'].sum()
disabled
    
    Out[19]:
In [20]:
    
total = disabledDF['tot2013'].sum()
total
    
    Out[20]:
In [21]:
    
prctDisabled = (disabled/total)*100
prctDisabled
    
    Out[21]: