In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
import os
In [4]:
# Look up the current working directory; the data file is read via a path relative to it.
cwd = os.getcwd()
In [5]:
print(cwd)
In [6]:
ls ../
police.csv is the Rhode Island dataset from the Stanford Open Policing Project, made available under the Open Data Commons Attribution License.
ted.csv is the TED Talks dataset from Kaggle Datasets, made available under the CC BY-NC-SA 4.0 license.
In [7]:
df = pd.read_csv('../police.csv')
Note: the `../` prefix steps one level up in the folder structure, which is where the CSV file is stored.
In [8]:
df.head()
Out[8]:
In [9]:
df.shape
Out[9]:
In [10]:
df.dtypes
Out[10]:
In [11]:
df.isnull().head()
Out[11]:
In [12]:
df.isnull().sum()
Out[12]:
In [13]:
# Remove the 'county_name' column.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the column is already gone.
df = df.drop(columns='county_name', errors='ignore')
In [14]:
df.shape
Out[14]:
(Alternative: `df.dropna(axis='columns', how='all')` drops all columns that contain only missing values.)
In [15]:
# Absolute number of speeding stops per driver gender.
df.loc[df.violation == 'Speeding', 'driver_gender'].value_counts()
Out[15]:
In [16]:
# Same breakdown as proportions (normalize=True) instead of raw counts.
df.loc[df.violation == 'Speeding', 'driver_gender'].value_counts(normalize=True)
Out[16]:
In [17]:
# Proportion of each violation type among stops of male drivers.
df.loc[df.driver_gender == 'M', 'violation'].value_counts(normalize=True)
Out[17]:
In [18]:
# Proportion of each violation type among stops of female drivers.
df.loc[df.driver_gender == 'F', 'violation'].value_counts(normalize=True)
Out[18]:
In [19]:
# Proportion of each violation type within every driver_gender group,
# computed in one call instead of one filter per gender.
df.groupby('driver_gender').violation.value_counts(normalize=True)
Out[19]:
e.g. does gender affect who gets searched during a stop?
In [20]:
# Proportion of search_conducted values within every driver_gender group.
df.groupby('driver_gender').search_conducted.value_counts(normalize=True)
Out[20]:
(Alternative: `df.groupby('driver_gender').search_conducted.mean()` gives the search rate directly; this works because the column is boolean, so its mean is the fraction of True values.)
In [21]:
# Same proportions of search_conducted values, now broken down by
# violation as well as driver_gender.
df.groupby(['violation','driver_gender']).search_conducted.value_counts(normalize=True)
Out[21]:
Is it possible that search_type is missing so often because no search was conducted in those stops (i.e. search_type depends on search_conducted)?
In [22]:
df.search_conducted.value_counts()
Out[22]:
In [23]:
# the count of stops without a search is the same as the number of missing search_type values
In [24]:
# search_type values recorded for stops where no search was conducted.
df.loc[df.search_conducted == False, 'search_type'].value_counts()
Out[24]:
In [25]:
# Empty, so every time search_conducted is False, search_type is NaN. value_counts() drops NaN by default, which is why nothing is shown.
In [26]:
# Pass dropna=False so that missing values are counted and shown as well.
df.loc[df.search_conducted == False, 'search_type'].value_counts(dropna=False)
Out[26]:
During a search, how often is the driver frisked?<br>
Caveat: frisking appears in multiple search_type categories.
In [27]:
df.search_type.value_counts()
Out[27]:
In [28]:
# Add a 'frisk' column that flags search types mentioning a frisk.
# A substring match via .str.contains() is used because 'Frisk' occurs
# inside several combined categories rather than as a single exact value.
df['frisk'] = df['search_type'].str.contains('Frisk')
In [29]:
df.frisk.value_counts(dropna=False)
Out[29]:
In [30]:
df.frisk.sum()
Out[30]:
In [31]:
df.frisk.mean() # share of True among the non-missing frisk values (mean skips NaN), i.e. among rows that have a search_type - not among all stops
Out[31]:
In [32]:
#check frisk column first 3 rows. df.frisk.head(3)
In [33]:
# Plan: df.groupby('year')...value_counts() etc., but first create a 'year' column by slicing the year out of the date string.
In [34]:
# The first four characters of the stop_date string are the year.
df['year'] = df['stop_date'].str[:4]
In [35]:
df.head(3)
Out[35]:
In [36]:
df.groupby('year').search_conducted.value_counts()
Out[36]:
In [37]:
# -> 2012
In [38]:
# More direct way (credit: Kevin Markham): count the stops per year
# straight from the sliced date string, without needing a helper column.
df.stop_date.str.slice(0, 4).value_counts()
Out[38]:
In [39]:
#done
In [40]:
# Join date and time into one 'date time' string per row (space-separated).
combined = df['stop_date'].str.cat(df['stop_time'], sep=' ')
In [41]:
combined.head(3)
Out[41]:
In [42]:
# Parse the combined date/time strings and store the result as a new column.
df['stop_datetime']= pd.to_datetime(combined)
In [43]:
df.dtypes
Out[43]:
In [44]:
# The .dt accessor of a datetime column exposes components such as
# year, month and hour, so no string slicing is needed.
df.stop_datetime.dt.year.value_counts()
Out[44]:
In [45]:
# Note to self: extracting the year does not require the time part, but the time is needed for the rest of the exercise.
In [46]:
df.head()
Out[46]:
In [47]:
df.drugs_related_stop.mean()
Out[47]:
In [48]:
df.groupby(df.stop_datetime.dt.hour).drugs_related_stop.mean()
Out[48]:
In [49]:
# Mean of drugs_related_stop per hour of the day, drawn as a line plot.
# Axis labels and a title are set so the figure can be read on its own;
# the trailing ';' suppresses the textual repr of the returned objects.
ax = df.groupby(df.stop_datetime.dt.hour).drugs_related_stop.mean().plot()
ax.set(xlabel='Hour of day', ylabel='Mean of drugs_related_stop',
       title='Drug-related stop rate by hour of day');
Out[49]:
In [50]:
# Number of non-null drugs_related_stop entries per hour of the day,
# drawn as a line plot.
df.groupby(df.stop_datetime.dt.hour).drugs_related_stop.count().plot()
Out[50]:
In [51]:
# value_counts() orders its result by frequency, not by hour, so the
# line plot connects the hours out of order and looks jumbled.
df.stop_datetime.dt.hour.value_counts().plot()
Out[51]:
In [52]:
# Fix the plot: sort by index (the hour) so the line runs in hour order.
# Axis labels and a title are set so the figure can be read on its own;
# the trailing ';' suppresses the textual repr of the returned objects.
ax = df.stop_datetime.dt.hour.value_counts().sort_index().plot()
ax.set(xlabel='Hour of day', ylabel='Number of stops',
       title='Stops by hour of day');
Out[52]:
In [53]:
#alternatively;
#df.groupby(df.stop_datetime.dt.hour).stop_date.count().plot()
In [56]:
df.stop_duration.head()
Out[56]:
In [57]:
df.stop_duration.value_counts(dropna=False)
Out[57]:
In [ ]:
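# df[(df.stop_duration == '1') | (df.stop_duration == '2')].stop_duration = np.nan does not work: the filter returns a temporary copy, so the assignment never reaches df. Use df.loc[rows, column] instead: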
In [73]:
# Treat the stray codes '1' and '2' in stop_duration as missing.
# np.nan is assigned (a real missing value) rather than the string 'NaN':
# a string would just be another category and would not be recognised by
# isnull() or by the dropna option of value_counts().
df.loc[df.stop_duration.isin(['1', '2']), 'stop_duration'] = np.nan
In [74]:
df.stop_duration.value_counts(dropna = False)
Out[74]:
In [75]:
import numpy as np
In [77]:
# Turn any literal 'NaN' strings in stop_duration into real missing values.
df['stop_duration'] = df['stop_duration'].mask(df['stop_duration'] == 'NaN', np.nan)
In [78]:
df.stop_duration.value_counts(dropna = False)
Out[78]:
In [79]:
# Alternative method: replace both codes in one call and assign the result back
#df['stop_duration'] = df.stop_duration.replace(['1', '2'], np.nan)
In [ ]: