In [754]:
import numpy as np
import pandas as pd
import datetime
import matplotlib as plt
import re
%matplotlib inline
In [755]:
df = pd.read_csv('violations.csv', nrows=1000, parse_dates=['Issue Date'])
In [756]:
df.columns
Out[756]:
In [757]:
df.dtypes
Out[757]:
In [758]:
# checks data type of each value in series Plate ID by printing if type does not equal string
# all values are strings
for x in df['Plate ID']:
if type(x) != str:
print(type(x))
In [759]:
df['Vehicle Year'] = df['Vehicle Year'].replace(0, np.nan)
df['Vehicle Year'] = pd.to_datetime(df['Vehicle Year'], format='%Y', errors = 'coerce')
In [760]:
# see where I read in csv for inclusion of parse_date arg
In [761]:
df['Date First Observed'] = df['Date First Observed'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d', errors='coerce'))
In [762]:
def remove_char(x):
x = x[:-1]
return x
def insert_char(x):
x = x[:2] + ':' + x[2:]
return x
In [763]:
# removes 'A' and 'P' from original values in Violation Time
df['Violation Time2'] = df['Violation Time'].apply(lambda x: remove_char(str(x)))
# inserts colon in to string
df['Violation Time3'] = df['Violation Time2'].apply(lambda x: insert_char(str(x)))
In [764]:
#makes values datetime format, allowing .to_datetime to infer format from string
df['Violation Time4'] = df['Violation Time3'].apply(lambda x: pd.to_datetime(str(x), infer_datetime_format=True, errors='coerce'))
#selects only time from timestamp
df['Violation Time'] = df['Violation Time4'].dt.time
In [765]:
# deletes temporary variables used to make violation time a time
df=df.drop(df.columns[[43, 44, 45]], axis=1)
df.columns
Out[765]:
In [766]:
def clean_color(x):
if x == 'BLK':
print(x)
x = x.replace('BLK','BLACK')
print('Now', x)
elif x == 'BK':
print(x)
x = x.replace('BK','BLACK')
print('Now', x)
elif x == 'W':
print(x)
x = x.replace('W','WHITE')
print('Now', x)
elif x == 'WH':
print(x)
x = x.replace('WH','WHITE')
print('Now', x)
elif x == 'WHT':
print(x)
x = x.replace('WHT','WHITE')
print('Now', x)
elif x == 'WT':
print(x)
x = x.replace('WT','WHITE')
print('Now', x)
elif x == 'GY':
print(x)
x = x.replace('GY','GREY')
print('Now', x)
elif x == 'GRAY':
print(x)
x = x.replace('GRAY','GREY')
print('Now', x)
elif x == 'GRY':
print(x)
x = x.replace('GRY','GREY')
print('Now', x)
elif x == 'G/Y':
print(x)
x = x.replace('G/Y','GREY')
print('Now', x)
elif x == 'BR':
print(x)
x = x.replace('BR','BROWN')
print('Now', x)
elif x == 'BRW':
print(x)
x = x.replace('BRW','BROWN')
print('Now', x)
elif x == 'TN':
print(x)
x = x.replace('GY','TAN')
print('Now', x)
elif x == 'RD':
print(x)
x = x.replace('RD','RED')
print('Now', x)
elif x == 'BL':
print(x)
x = x.replace('BL','BLUE')
print('Now', x)
elif x == 'BLU':
print(x)
x = x.replace('BLU','BLUE')
print('Now', x)
elif x == 'SIL':
print(x)
x = x.replace('SIL','SILVER')
print('Now', x)
elif x == 'SILVE':
print(x)
x = x.replace('SILVE','SILVER')
print('Now', x)
elif x == 'BURGA':
print(x)
x = x.replace('BURGA','BURGANDY')
print('Now', x)
elif x == 'PURPL':
print(x)
x = x.replace('PURPL','PURPLE')
print('Now', x)
elif x == 'PUR':
print(x)
x = x.replace('PUR','PURPLE')
print('Now', x)
elif x == 'YELLO':
print(x)
x = x.replace('YELLO','YELLOW')
print('Now', x)
elif x == 'YW':
print(x)
x = x.replace('YW','YELLOW')
print('Now', x)
elif x == 'OR':
print(x)
x = x.replace('OR','ORANGE')
print('Now', x)
elif x == 'GL':
print(x)
x = x.replace('GL','GOLD')
print('Now', x)
elif x == 'GR':
print(x)
x = x.replace('GR','GREEN')
print('Now', x)
elif x == 'GRN':
print(x)
x = x.replace('GRN','GREEN')
print('Now', x)
elif x == 'G':
print(x)
x = x.replace('G','GREEN')
print('Now', x)
return x
In [767]:
df['Vehicle Color'] = df['Vehicle Color'].apply(lambda x: clean_color(x))
df['Vehicle Color']
Out[767]:
In [768]:
#check for remaining color abbreviations
for x in df['Vehicle Color']:
if len(str(x)) <= 2:
print(x)
In [769]:
df2 = pd.read_csv('DOF_Parking_Violation_Codes.csv')
In [770]:
df2
Out[770]:
In [771]:
# change format to string to match violation codes data set format
df['Violation Code'] = df['Violation Code'].apply(lambda x: str(x).strip())
In [772]:
# check formatting and string length
for x in df['Violation Code']:
if type(x) != str:
print(type(x))
if len(x) > 2:
print(x)
In [773]:
df3 = pd.merge(df, df2, left_on='Violation Code', right_on='CODE', how='left', indicator=True)
In [774]:
# unsuccessful merges bc Violation Code 41 does not exist in Violation codes data.
# and bc 38 in documented at 37-38 in Violation codes data
df3[df3['_merge']=='left_only']
Out[774]:
In [775]:
# removes unsuccessful merges
df4 = df3[df3['_merge'] !='left_only']
In [776]:
# Create boolean variable to indicate 0/1: Parking violation
df4['Parking Violation'] = df4['DEFINITION'].str.contains('[Pp]arking') | df4['DEFINITION'].str.contains('[Pp]arked')
df4['Parking Violation'].value_counts()
df4[df4['Parking Violation'] == False]
Out[776]:
In [869]:
# df4.groupby('Parking Violation')[''].sum()[True]
df4['Street Name'].value_counts()
above_96_lst=[]
for x in df4['Street Name']:
if re.search(r'\S \d \S\w ', str(x)):
above_96 = False
above_96_lst.append(above_96)
elif re.search(r'\d[[Aa]-[Zz]][[Aa]-[Zz]] ', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'\w\s\d\d\s', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'[A-Z]\s\d\d[A-Z][A-Z] [A-Z][A-Z]', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'[0-2][A-Z][A-Z] AVE', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
else:
above_96 = True
above_96_lst.append(above_96)
# print(x, above_96)
# above_96_lst.count(False)
df4['Above_96'] = above_96_lst
In [875]:
df4 .h
Out[875]:
In [ ]: