In [296]:
import numpy as np
import pandas as pd
import datetime
import matplotlib as plt
import re
%matplotlib inline
In [297]:
df = pd.read_csv('violations.csv', nrows=1000, parse_dates=['Issue Date'])
In [298]:
df.columns
Out[298]:
In [299]:
df.dtypes
Out[299]:
In [300]:
# checks data type of each value in series Plate ID by printing if type does not equal string
# all values are strings
for x in df['Plate ID']:
if type(x) != str:
print(type(x))
In [301]:
df['Vehicle Year'] = df['Vehicle Year'].replace(0, np.nan)
df['Vehicle Year'] = pd.to_datetime(df['Vehicle Year'], format='%Y', errors = 'coerce')
In [302]:
# see where I read in csv for inclusion of parse_date arg
In [303]:
df['Date First Observed'] = df['Date First Observed'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d', errors='coerce'))
In [304]:
def remove_char(x):
x = x[:-1]
return x
def insert_char(x):
x = x[:2] + ':' + x[2:]
return x
In [305]:
# removes 'A' and 'P' from original values in Violation Time
df['Violation Time2'] = df['Violation Time'].apply(lambda x: remove_char(str(x)))
# inserts colon in to string
df['Violation Time3'] = df['Violation Time2'].apply(lambda x: insert_char(str(x)))
In [306]:
#makes values datetime format, allowing .to_datetime to infer format from string
df['Violation Time4'] = df['Violation Time3'].apply(lambda x: pd.to_datetime(str(x), infer_datetime_format=True, errors='coerce'))
#selects only time from timestamp
df['Violation Time'] = df['Violation Time4'].dt.time
In [307]:
# deletes temporary variables used to make violation time a time
df=df.drop(df.columns[[43, 44, 45]], axis=1)
df.columns
Out[307]:
In [308]:
def clean_color(x):
if x == 'BLK':
print(x)
x = x.replace('BLK','BLACK')
print('Now', x)
elif x == 'BK':
print(x)
x = x.replace('BK','BLACK')
print('Now', x)
elif x == 'W':
print(x)
x = x.replace('W','WHITE')
print('Now', x)
elif x == 'WH':
print(x)
x = x.replace('WH','WHITE')
print('Now', x)
elif x == 'WHT':
print(x)
x = x.replace('WHT','WHITE')
print('Now', x)
elif x == 'WT':
print(x)
x = x.replace('WT','WHITE')
print('Now', x)
elif x == 'GY':
print(x)
x = x.replace('GY','GREY')
print('Now', x)
elif x == 'GRAY':
print(x)
x = x.replace('GRAY','GREY')
print('Now', x)
elif x == 'GRY':
print(x)
x = x.replace('GRY','GREY')
print('Now', x)
elif x == 'G/Y':
print(x)
x = x.replace('G/Y','GREY')
print('Now', x)
elif x == 'BR':
print(x)
x = x.replace('BR','BROWN')
print('Now', x)
elif x == 'BRW':
print(x)
x = x.replace('BRW','BROWN')
print('Now', x)
elif x == 'TN':
print(x)
x = x.replace('GY','TAN')
print('Now', x)
elif x == 'RD':
print(x)
x = x.replace('RD','RED')
print('Now', x)
elif x == 'BL':
print(x)
x = x.replace('BL','BLUE')
print('Now', x)
elif x == 'BLU':
print(x)
x = x.replace('BLU','BLUE')
print('Now', x)
elif x == 'SIL':
print(x)
x = x.replace('SIL','SILVER')
print('Now', x)
elif x == 'SILVE':
print(x)
x = x.replace('SILVE','SILVER')
print('Now', x)
elif x == 'BURGA':
print(x)
x = x.replace('BURGA','BURGANDY')
print('Now', x)
elif x == 'PURPL':
print(x)
x = x.replace('PURPL','PURPLE')
print('Now', x)
elif x == 'PUR':
print(x)
x = x.replace('PUR','PURPLE')
print('Now', x)
elif x == 'YELLO':
print(x)
x = x.replace('YELLO','YELLOW')
print('Now', x)
elif x == 'YW':
print(x)
x = x.replace('YW','YELLOW')
print('Now', x)
elif x == 'OR':
print(x)
x = x.replace('OR','ORANGE')
print('Now', x)
elif x == 'GL':
print(x)
x = x.replace('GL','GOLD')
print('Now', x)
elif x == 'GR':
print(x)
x = x.replace('GR','GREEN')
print('Now', x)
elif x == 'GRN':
print(x)
x = x.replace('GRN','GREEN')
print('Now', x)
elif x == 'G':
print(x)
x = x.replace('G','GREEN')
print('Now', x)
return x
In [309]:
df['Vehicle Color'] = df['Vehicle Color'].apply(lambda x: clean_color(x))
df['Vehicle Color']
Out[309]:
In [310]:
#check for remaining color abbreviations
for x in df['Vehicle Color']:
if len(str(x)) <= 2:
print(x)
In [311]:
df2 = pd.read_csv('DOF_Parking_Violation_Codes.csv')
In [312]:
df2
Out[312]:
In [313]:
# change format to string to match violation codes data set format
df['Violation Code'] = df['Violation Code'].apply(lambda x: str(x).strip())
In [314]:
# check formatting and string length
for x in df['Violation Code']:
if type(x) != str:
print(type(x))
if len(x) > 2:
print(x)
In [315]:
df3 = pd.merge(df, df2, left_on='Violation Code', right_on='CODE', how='left', indicator=True)
In [316]:
# unsuccessful merges bc Violation Code 41 does not exist in Violation codes data.
# and bc 38 in documented at 37-38 in Violation codes data
df3[df3['_merge']=='left_only']
Out[316]:
In [317]:
# removes unsuccessful merges
df4 = df3[df3['_merge'] !='left_only']
In [318]:
# Create boolean variable to indicate 0/1: Parking violation
df4['Parking Violation'] = df4['DEFINITION'].str.contains('[Pp]arking') | df4['DEFINITION'].str.contains('[Pp]arked')
df4['Parking Violation'].value_counts()
df4[df4['Parking Violation'] == False]
Out[318]:
In [319]:
df4['Street Name'].value_counts()
#create empty list
above_96_lst=[]
#create bool object
for x in df4['Street Name']:
if re.search(r'\S \d \S\w ', str(x)):
above_96 = False
above_96_lst.append(above_96)
elif re.search(r'\d[[Aa]-[Zz]][[Aa]-[Zz]] ', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'\w\s\d\d\s', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'[A-Z]\s\d\d[A-Z][A-Z] [A-Z][A-Z]', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
elif re.search(r'[0-2][A-Z][A-Z] AVE', str(x)):
above_96 = False
above_96_lst.append(above_96)
# print(x, above_96)
else:
above_96 = True
above_96_lst.append(above_96)
# print(x, above_96)
# above_96_lst.count(False)
df4['Above_96'] = above_96_lst
In [320]:
# rename column so easier to reference
df4.rename(columns={"Manhattan\xa0 96th St. & below": "Manhattan_Below_96"}, inplace=True)
In [321]:
df4[['Violation Code', 'Street Name', 'Above_96', 'Manhattan_Below_96', 'All Other Areas', 'Parking Violation']].head(20)
Out[321]:
In [322]:
# remove dollar signs
df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('$', '')
# replace long descriptions of cost with only the amount
# df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('100\n(Regular Tow, plus violation fine)', '100')
df4.ix[df4['Manhattan_Below_96'] == '100\n(Regular Tow, plus violation fine)', 'Manhattan_Below_96'] = '100'
# replace 'vary' with zero
df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('vary', '0')
# change to int
df4['Manhattan_Below_96'].apply(lambda x: int(x))
Out[322]:
In [323]:
df4['All Other Areas'] = df4['All Other Areas'].str.replace('$', '')
df4['All Other Areas'] = df4['All Other Areas'].str.replace('vary', '0')
df4.ix[df4['All Other Areas'] == '200 (Heavy Tow, plus violation fine)', 'All Other Areas'] = '200'
df4['All Other Areas'].apply(lambda x: int(x))
Out[323]: