In [296]:
import numpy as np 
import pandas as pd
import datetime
import matplotlib as plt
import re
%matplotlib inline

In [297]:
df = pd.read_csv('violations.csv', nrows=1000, parse_dates=['Issue Date'])

In [298]:
df.columns


Out[298]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation'],
      dtype='object')

In [299]:
df.dtypes


Out[299]:
Summons Number                                int64
Plate ID                                     object
Registration State                           object
Plate Type                                   object
Issue Date                           datetime64[ns]
Violation Code                                int64
Vehicle Body Type                            object
Vehicle Make                                 object
Issuing Agency                               object
Street Code1                                  int64
Street Code2                                  int64
Street Code3                                  int64
Vehicle Expiration Date                       int64
Violation Location                          float64
Violation Precinct                            int64
Issuer Precinct                               int64
Issuer Code                                   int64
Issuer Command                               object
Issuer Squad                                  int64
Violation Time                               object
Time First Observed                          object
Violation County                             object
Violation In Front Of Or Opposite            object
House Number                                 object
Street Name                                  object
Intersecting Street                          object
Date First Observed                           int64
Law Section                                   int64
Sub Division                                 object
Violation Legal Code                        float64
Days Parking In Effect                       object
From Hours In Effect                         object
To Hours In Effect                           object
Vehicle Color                                object
Unregistered Vehicle?                         int64
Vehicle Year                                  int64
Meter Number                                 object
Feet From Curb                                int64
Violation Post Code                         float64
Violation Description                       float64
No Standing or Stopping Violation           float64
Hydrant Violation                           float64
Double Parking Violation                    float64
dtype: object

1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!


In [300]:
# checks data type of each value in series Plate ID by printing if type does not equal string
# all values are strings
for x in df['Plate ID']: 
    if type(x) != str:
        print(type(x))

2. I don't think anyone's car was built in 0AD. Discard the '0's as NaN.


In [301]:
df['Vehicle Year'] = df['Vehicle Year'].replace(0, np.nan)
df['Vehicle Year'] = pd.to_datetime(df['Vehicle Year'], format='%Y', errors = 'coerce')

3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.


In [302]:
# see where I read in csv for inclusion of parse_date arg

4. "Date first observed" is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. "20140324") into a Python date. Make the 0's show up as NaN.


In [303]:
df['Date First Observed'] = df['Date First Observed'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d', errors='coerce'))

5. "Violation time" is... not a time. Make it a time.


In [304]:
def remove_char(x):
    x = x[:-1]
    return x
def insert_char(x):
    x = x[:2] + ':' + x[2:]
    return x

In [305]:
# removes 'A' and 'P' from original values in Violation Time
df['Violation Time2'] = df['Violation Time'].apply(lambda x: remove_char(str(x)))
# inserts colon in to string 
df['Violation Time3'] = df['Violation Time2'].apply(lambda x: insert_char(str(x)))

In [306]:
#makes values datetime format, allowing .to_datetime to infer format from string
df['Violation Time4'] = df['Violation Time3'].apply(lambda x: pd.to_datetime(str(x), infer_datetime_format=True, errors='coerce'))
#selects only time from timestamp 
df['Violation Time'] = df['Violation Time4'].dt.time

In [307]:
# deletes temporary variables used to make violation time a time 
df=df.drop(df.columns[[43, 44, 45]], axis=1)
df.columns


Out[307]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation'],
      dtype='object')

6. There sure are a lot of colors of cars, too bad so many of them are the same. Make "BLK" and "BLACK", "WT" and "WHITE", and any other combinations that you notice.


In [308]:
def clean_color(x): 
    if x == 'BLK': 
        print(x)
        x = x.replace('BLK','BLACK')
        print('Now', x)
    elif x == 'BK': 
        print(x)
        x = x.replace('BK','BLACK')
        print('Now', x)
    elif x == 'W': 
        print(x)
        x = x.replace('W','WHITE')
        print('Now', x)
    elif x == 'WH': 
        print(x)
        x = x.replace('WH','WHITE')
        print('Now', x)
    elif x == 'WHT': 
        print(x)
        x = x.replace('WHT','WHITE')
        print('Now', x)
    elif x == 'WT': 
        print(x)
        x = x.replace('WT','WHITE')
        print('Now', x)
    elif x == 'GY': 
        print(x)
        x = x.replace('GY','GREY')
        print('Now', x)
    elif x == 'GRAY': 
        print(x)
        x = x.replace('GRAY','GREY')
        print('Now', x)
    elif x == 'GRY': 
        print(x)
        x = x.replace('GRY','GREY')
        print('Now', x)
    elif x == 'G/Y': 
        print(x)
        x = x.replace('G/Y','GREY')
        print('Now', x)
    elif x == 'BR': 
        print(x)
        x = x.replace('BR','BROWN')
        print('Now', x)
    elif x == 'BRW': 
        print(x)
        x = x.replace('BRW','BROWN')
        print('Now', x)
    elif x == 'TN': 
        print(x)
        x = x.replace('GY','TAN')
        print('Now', x)
    elif x == 'RD': 
        print(x)
        x = x.replace('RD','RED')
        print('Now', x)
    elif x == 'BL': 
        print(x)
        x = x.replace('BL','BLUE')
        print('Now', x)
    elif x == 'BLU': 
        print(x)
        x = x.replace('BLU','BLUE')
        print('Now', x)
    elif x == 'SIL':
        print(x)
        x = x.replace('SIL','SILVER')
        print('Now', x)  
    elif x == 'SILVE':
        print(x)
        x = x.replace('SILVE','SILVER')
        print('Now', x) 
    elif x == 'BURGA':
        print(x)
        x = x.replace('BURGA','BURGANDY')
        print('Now', x) 
    elif x == 'PURPL':
        print(x)
        x = x.replace('PURPL','PURPLE')
        print('Now', x) 
    elif x == 'PUR':
        print(x)
        x = x.replace('PUR','PURPLE')
        print('Now', x) 
    elif x == 'YELLO':
        print(x)
        x = x.replace('YELLO','YELLOW')
        print('Now', x) 
    elif x == 'YW':
        print(x)
        x = x.replace('YW','YELLOW')
        print('Now', x)
    elif x == 'OR':
        print(x)
        x = x.replace('OR','ORANGE')
        print('Now', x)
    elif x == 'GL':
        print(x)
        x = x.replace('GL','GOLD')
        print('Now', x)
    elif x == 'GR':
        print(x)
        x = x.replace('GR','GREEN')
        print('Now', x)
    elif x == 'GRN':
        print(x)
        x = x.replace('GRN','GREEN')
        print('Now', x)
    elif x == 'G':
        print(x)
        x = x.replace('G','GREEN')
        print('Now', x)
    return x

In [309]:
df['Vehicle Color'] = df['Vehicle Color'].apply(lambda x: clean_color(x))
df['Vehicle Color']


GY
Now GREY
WH
Now WHITE
WH
Now WHITE
BR
Now BROWN
RD
Now RED
GRAY
Now GREY
WH
Now WHITE
GY
Now GREY
GRAY
Now GREY
TN
Now TN
BK
Now BLACK
WHT
Now WHITE
WH
Now WHITE
GR
Now GREEN
RD
Now RED
GY
Now GREY
GY
Now GREY
BL
Now BLUE
BK
Now BLACK
BLU
Now BLUE
WHT
Now WHITE
GRAY
Now GREY
BR
Now BROWN
GY
Now GREY
BK
Now BLACK
BLK
Now BLACK
SILVE
Now SILVER
BK
Now BLACK
GRAY
Now GREY
WH
Now WHITE
GY
Now GREY
SILVE
Now SILVER
BK
Now BLACK
SILVE
Now SILVER
YELLO
Now YELLOW
GRY
Now GREY
WHT
Now WHITE
WHT
Now WHITE
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
GRY
Now GREY
SILVE
Now SILVER
YELLO
Now YELLOW
BLK
Now BLACK
BL
Now BLUE
BLK
Now BLACK
BL
Now BLUE
GRAY
Now GREY
GRAY
Now GREY
GY
Now GREY
BK
Now BLACK
BK
Now BLACK
GY
Now GREY
GY
Now GREY
GRAY
Now GREY
WH
Now WHITE
GRAY
Now GREY
WH
Now WHITE
WH
Now WHITE
OR
Now ORANGE
BL
Now BLUE
W
Now WHITE
WHT
Now WHITE
GRAY
Now GREY
BK
Now BLACK
SIL
Now SILVER
WHT
Now WHITE
WHT
Now WHITE
WHT
Now WHITE
GRAY
Now GREY
BLK
Now BLACK
BLK
Now BLACK
BLK
Now BLACK
GR
Now GREEN
BK
Now BLACK
WH
Now WHITE
BLK
Now BLACK
YELLO
Now YELLOW
YELLO
Now YELLOW
WHT
Now WHITE
BL
Now BLUE
PURPL
Now PURPLE
G/Y
Now GREY
GY
Now GREY
BK
Now BLACK
WHT
Now WHITE
BRW
Now BROWN
SILVE
Now SILVER
SILVE
Now SILVER
WHT
Now WHITE
GRY
Now GREY
BLK
Now BLACK
GRY
Now GREY
GRY
Now GREY
GRY
Now GREY
BLK
Now BLACK
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
YELLO
Now YELLOW
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
PURPL
Now PURPLE
SILVE
Now SILVER
GRAY
Now GREY
SILVE
Now SILVER
BLK
Now BLACK
SILVE
Now SILVER
BLK
Now BLACK
SILVE
Now SILVER
BLK
Now BLACK
BLK
Now BLACK
BLK
Now BLACK
GRAY
Now GREY
YELLO
Now YELLOW
WH
Now WHITE
GY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
BLK
Now BLACK
SILVE
Now SILVER
BK
Now BLACK
BLK
Now BLACK
GY
Now GREY
GRAY
Now GREY
BL
Now BLUE
BLK
Now BLACK
SILVE
Now SILVER
BL
Now BLUE
GRY
Now GREY
WH
Now WHITE
RD
Now RED
WH
Now WHITE
BK
Now BLACK
GY
Now GREY
BLK
Now BLACK
WHT
Now WHITE
BL
Now BLUE
WH
Now WHITE
BK
Now BLACK
WHT
Now WHITE
BR
Now BROWN
WHT
Now WHITE
BLK
Now BLACK
GRY
Now GREY
BL
Now BLUE
BK
Now BLACK
GY
Now GREY
WH
Now WHITE
WHT
Now WHITE
WHT
Now WHITE
TN
Now TN
WHT
Now WHITE
WHT
Now WHITE
BLK
Now BLACK
GRY
Now GREY
WHT
Now WHITE
GRY
Now GREY
BK
Now BLACK
WH
Now WHITE
BK
Now BLACK
WH
Now WHITE
GY
Now GREY
G
Now GREEN
BK
Now BLACK
WH
Now WHITE
W
Now WHITE
GY
Now GREY
BK
Now BLACK
YELLO
Now YELLOW
G
Now GREEN
GY
Now GREY
BL
Now BLUE
GY
Now GREY
BLK
Now BLACK
BK
Now BLACK
BK
Now BLACK
GY
Now GREY
GY
Now GREY
WH
Now WHITE
BK
Now BLACK
BR
Now BROWN
WH
Now WHITE
BR
Now BROWN
BL
Now BLUE
GR
Now GREEN
BK
Now BLACK
GL
Now GOLD
GY
Now GREY
SILVE
Now SILVER
GY
Now GREY
BLK
Now BLACK
WHT
Now WHITE
BR
Now BROWN
RD
Now RED
BK
Now BLACK
BK
Now BLACK
GY
Now GREY
BLK
Now BLACK
BR
Now BROWN
BR
Now BROWN
BL
Now BLUE
BLK
Now BLACK
GY
Now GREY
BL
Now BLUE
GY
Now GREY
GY
Now GREY
BK
Now BLACK
BL
Now BLUE
BR
Now BROWN
WHT
Now WHITE
BK
Now BLACK
BLK
Now BLACK
GY
Now GREY
GY
Now GREY
BR
Now BROWN
GL
Now GOLD
BK
Now BLACK
BK
Now BLACK
GY
Now GREY
GY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
YELLO
Now YELLOW
BK
Now BLACK
SILVE
Now SILVER
SILVE
Now SILVER
SILVE
Now SILVER
BK
Now BLACK
WH
Now WHITE
GRAY
Now GREY
GRAY
Now GREY
BK
Now BLACK
BK
Now BLACK
GY
Now GREY
WH
Now WHITE
GY
Now GREY
YW
Now YELLOW
YELLO
Now YELLOW
WH
Now WHITE
BLK
Now BLACK
SILVE
Now SILVER
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
BLK
Now BLACK
BLK
Now BLACK
BK
Now BLACK
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
BLK
Now BLACK
BLK
Now BLACK
GRAY
Now GREY
BLK
Now BLACK
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
BK
Now BLACK
GY
Now GREY
BK
Now BLACK
BLK
Now BLACK
GY
Now GREY
RD
Now RED
GY
Now GREY
GRY
Now GREY
WHT
Now WHITE
RD
Now RED
TN
Now TN
GRN
Now GREEN
BK
Now BLACK
GY
Now GREY
GRN
Now GREEN
GRAY
Now GREY
WHT
Now WHITE
BK
Now BLACK
GY
Now GREY
RD
Now RED
WH
Now WHITE
RD
Now RED
BK
Now BLACK
RD
Now RED
GR
Now GREEN
BK
Now BLACK
GR
Now GREEN
GRY
Now GREY
BLK
Now BLACK
WHT
Now WHITE
PUR
Now PURPLE
BLK
Now BLACK
WHT
Now WHITE
BLK
Now BLACK
WHT
Now WHITE
WHT
Now WHITE
GRY
Now GREY
BLK
Now BLACK
GRY
Now GREY
BLK
Now BLACK
GRY
Now GREY
WHT
Now WHITE
WHT
Now WHITE
WH
Now WHITE
BK
Now BLACK
GY
Now GREY
WHT
Now WHITE
GY
Now GREY
BL
Now BLUE
BLK
Now BLACK
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
GRAY
Now GREY
BLK
Now BLACK
SILVE
Now SILVER
SILVE
Now SILVER
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
BLK
Now BLACK
SILVE
Now SILVER
SILVE
Now SILVER
GRAY
Now GREY
SILVE
Now SILVER
SILVE
Now SILVER
YELLO
Now YELLOW
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
GRAY
Now GREY
YELLO
Now YELLOW
G
Now GREEN
GRAY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
GRAY
Now GREY
GRAY
Now GREY
WH
Now WHITE
GRAY
Now GREY
WH
Now WHITE
WH
Now WHITE
GY
Now GREY
GY
Now GREY
GY
Now GREY
BL
Now BLUE
WH
Now WHITE
GY
Now GREY
WH
Now WHITE
GY
Now GREY
BK
Now BLACK
YELLO
Now YELLOW
GRAY
Now GREY
SILVE
Now SILVER
SILVE
Now SILVER
PURPL
Now PURPLE
BLK
Now BLACK
BL
Now BLUE
BLK
Now BLACK
SILVE
Now SILVER
BLK
Now BLACK
GRAY
Now GREY
BL
Now BLUE
BURGA
Now BURGANDY
YELLO
Now YELLOW
GRY
Now GREY
GRAY
Now GREY
SILVE
Now SILVER
Out[309]:
0          GREY
1         WHITE
2           NaN
3         WHITE
4         BROWN
5           RED
6            GN
7         WHITE
8         WHITE
9         BLACK
10         BLUE
11          TAN
12         GOLD
13        WHITE
14        GREEN
15        WHITE
16         GREY
17        GREEN
18        WHITE
19        WHITE
20          TAN
21         GREY
22         GREY
23           TN
24        BLACK
25        WHITE
26        WHITE
27        GREEN
28          RED
29         GREY
         ...   
970       BLACK
971      SILVER
972       BLACK
973        GREY
974        GREY
975        BLUE
976        GOLD
977         RED
978    BURGANDY
979       BLACK
980       WHITE
981       BLACK
982        BLUE
983        GOLD
984      YELLOW
985        BLUE
986        GREY
987        BLUE
988         RED
989       GREEN
990       GREEN
991       BLACK
992        GREY
993        GREY
994        GREY
995       WHITE
996        BLUE
997      SILVER
998       BLACK
999        GREY
Name: Vehicle Color, dtype: object

In [310]:
#check for remaining color abbreviations
for x in df['Vehicle Color']:
    if len(str(x)) <= 2: 
        print(x)


GN
TN
GN
TN
IL
TN

7. Join the data with the Parking Violations Code dataset from the NYC Open Data site.


In [311]:
df2 = pd.read_csv('DOF_Parking_Violation_Codes.csv')

In [312]:
df2


Out[312]:
CODE DEFINITION Manhattan  96th St. & below All Other Areas
0 10 Stopping, standing or parking where a sign, st... $115 $115
1 11 Hotel Loading/Unloading: Standing or parking w... $115 $115
2 12 Snow Emergency: Standing or parking where stan... $95 $95
3 13 Taxi Stand: Standing or parking where standing... $115 $115
4 14 General No Standing: Standing or parking where... $115 $115
5 16 Truck Loading/Unloading: Standing or parking w... $95 $95
6 17 Authorized Vehicles Only: Standing or parking ... $95 $95
7 18 Bus Lane: Standing or parking where standing i... $115 $115
8 19 Bus Stop: Standing or parking where standing i... $115 $115
9 20 General No Parking: No parking where parking i... $65 $60
10 21 Street Cleaning: No parking where parking is n... $65 $45
11 22 Hotel Loading/Unloading: No parking where park... $60 $60
12 23 Taxi Stand: No parking where parking is not al... $65 $60
13 24 Authorized Vehicles Only: No parking where par... $65 $60
14 25 Standing at a commuter van stop, other than te... $115 $115
15 26 Standing at a for-hire vehicle stop, other tha... $115 $115
16 27 No parking in a handicapped zone (off-street o... $180 $180
17 28 Overtime standing (diplomat) $95 $95
18 31 Standing of a non-commercial vehicle in a comm... $115 $115
19 32 Parking at a broken or missing meter for longe... $65 $35
20 33 "Feeding Meter" -- parking in a metered space ... $65 $35
21 34 Expired Meter -- parking in a metered space wh... $65 $35
22 35 Parking in a meter space for the purpose of di... $65 $35
23 37-38 Muni Meter --\n(37) Parking in excess of the a... $65 $35
24 39 Parking for longer than the maximum time permi... $65 $60
25 69 Failing to show a muni-meter receipt, commerci... $65 $65
26 40 Stopping, standing or parking closer than 15 f... $115 $115
27 42 Parking in a Muni Metered space in a commercia... $65 $35
28 43 Parking in a commercial metered zone in which ... $65 $35
29 44 Parking in a commercial metered zone for longe... $65 $35
... ... ... ... ...
54 71 Standing or parking a vehicle without showing ... $65 $65
55 72 Standing or parking a vehicle with NY Plates a... $65 $65
56 73 Standing or parking a vehicle showing an expir... $65 $65
57 74 Standing or parking a vehicle without properly... $65 $65
58 75 Standing or parking a vehicle in which the Lic... $65 $65
59 77 Parking a bus, unless allowed by signs. A char... $65 $45
60 78 Parking a Commercial Vehicle on a residential ... $65 $65
61 79 For a bus without passengers, waiting at a cur... $115 $115
62 80 Standing or parking a vehicle without head lam... $60 $45
63 81 No standing except diplomat $95 $95
64 82 Standing or parking a Commercial Vehicle unles... $115 $115
65 83 Standing or parking a vehicle which is not pro... $65 $65
66 84 Parking a Commercial Vehicle on any city stree... $65 $45
67 85 Parking a Commercial Vehicle more than 3 hours... $65 $65
68 86 Standing or parking a vehicle to make pickups,... $115 $115
69 89 Standing or parking a vehicle in the Garment D... $115 $115
70 91 Parking in order to sell a vehicle by a person... $65 $45
71 92 Parking in order to wash, grease, or repair a ... $65 $45
72 93 Stopping, standing or parking on paved roadway... $65 $65
73 94 Vehicle Release Agreement code associated with... 100\n(Regular Tow, plus violation fine) $200 (Heavy Tow, plus violation fine)
74 96 Standing or parking within 50 feet of the near... $95 $95
75 97 Parking in a vacant lot. A vehicle may be park... $65 $45
76 98 Standing or parking in front of a public or pr... $95 $95
77 99 All other parking, standing or stopping violat... vary vary
78 04 Vehicles parked illegally south of Houston Str... $115 NaN
79 05 Failure to make a right turn from a bus lane. $115 $115
80 06 Parking a tractor-trailer on a residential str... 1st Offense - $265\n2nd Offense - $515 2nd Offense - $265\n2nd Offense - $515
81 07 Vehicles photographed going through a red ligh... $50 $50
82 08 Vehicle idling in a restricted area. $115 $115
83 09 Blocking an Intersection: Obstructing traffic ... $115 $115

84 rows × 4 columns


In [313]:
# change format to string to match violation codes data set format 
df['Violation Code'] = df['Violation Code'].apply(lambda x: str(x).strip())

In [314]:
# check formatting and string length
for x in df['Violation Code']:
    if type(x) != str: 
        print(type(x))
    if len(x) > 2: 
        print(x)

In [315]:
df3 = pd.merge(df, df2, left_on='Violation Code', right_on='CODE', how='left', indicator=True)

In [316]:
# unsuccessful merges bc Violation Code 41 does not exist in Violation codes data.
# and bc 38 in documented at 37-38 in Violation codes data 
df3[df3['_merge']=='left_only']


Out[316]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation CODE DEFINITION Manhattan  96th St. & below All Other Areas _merge
4 1283294187 91648MC NY COM 2013-08-08 41 TRLR GMC P 37240 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
37 1288360800 HYM4303 PA PAS 2013-08-06 41 SDN MITSU P 29090 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
87 1294713218 GFC8215 NY PAS 2013-08-13 38 SUBN CHEVR P 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
88 1294713220 6417CO CT COM 2013-08-13 38 VAN CHEVR P 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
217 1321932730 900ZIH CA PAS 2013-08-12 41 SUBN AUDI S 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
501 1295572450 GBJ3633 NY PAS 2013-07-20 37 SDN NISSA P 0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
581 1309169688 98981 NY COM 2013-08-01 38 VAN DODGE P 34770 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
885 1334596554 JFE4755 PA PAS 2013-08-03 41 SUBN CHEVR X 11710 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
930 1334738531 U3261 99 COM 2013-08-22 6 TRLR STRIC S 73570 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only
981 1335087930 GGB3579 NY PAS 2013-07-27 41 SDN HYUND P 73110 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN left_only

10 rows × 48 columns

8. How much money did NYC make off of parking violations?


In [317]:
# removes unsuccessful merges
df4 = df3[df3['_merge'] !='left_only']

In [318]:
# Create boolean variable to indicate 0/1: Parking violation 
df4['Parking Violation'] = df4['DEFINITION'].str.contains('[Pp]arking') | df4['DEFINITION'].str.contains('[Pp]arked')
df4['Parking Violation'].value_counts()
df4[df4['Parking Violation'] == False]


/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[318]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation CODE DEFINITION Manhattan  96th St. & below All Other Areas _merge Parking Violation
32 1287518126 WMI CT PAS 2013-07-24 31 SUBN ME/BE T 34510 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
51 6020309034 GEW4836 99 PAS 2013-07-18 94 SDN ACURA X 0 ... NaN NaN NaN NaN 94 Vehicle Release Agreement code associated with... 100\n(Regular Tow, plus violation fine) $200 (Heavy Tow, plus violation fine) both False
57 1293094389 GDS9810 NJ PAS 2013-06-20 31 SDN BMW P 24890 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
378 1333131951 947WT5 MA PAS 2013-07-22 31 SDN NISSA T 17790 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
384 1333509005 447ZRW CT PAS 2013-07-20 31 SDN FORD T 34570 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
390 6020310784 447ZRW CT PAS 2013-07-20 94 SDN FORD X 0 ... NaN NaN NaN NaN 94 Vehicle Release Agreement code associated with... 100\n(Regular Tow, plus violation fine) $200 (Heavy Tow, plus violation fine) both False
402 1333711438 ZTD55E NJ PAS 2013-07-06 31 SDN CHEVR T 34570 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
520 6020314583 87336JV PA PAS 2013-07-26 94 SDN TOYOT X 0 ... NaN NaN NaN NaN 94 Vehicle Release Agreement code associated with... 100\n(Regular Tow, plus violation fine) $200 (Heavy Tow, plus violation fine) both False
625 6020342748 HWD923 SC PAS 2013-07-22 94 SUBN KIA X 0 ... NaN NaN NaN NaN 94 Vehicle Release Agreement code associated with... 100\n(Regular Tow, plus violation fine) $200 (Heavy Tow, plus violation fine) both False
746 1333717192 402YBY CT PAS 2013-07-13 31 SDN NISSA T 34570 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False
747 1333717209 H77CXW NJ PAS 2013-07-13 31 SDN HONDA T 34570 ... NaN NaN NaN NaN 31 Standing of a non-commercial vehicle in a comm... $115 $115 both False

11 rows × 49 columns


In [319]:
df4['Street Name'].value_counts()
#create empty list
above_96_lst=[]
#create bool object 
for x in df4['Street Name']: 
    if re.search(r'\S \d \S\w ', str(x)):
        above_96 =  False
        above_96_lst.append(above_96)
    elif re.search(r'\d[[Aa]-[Zz]][[Aa]-[Zz]] ', str(x)):
        above_96 =  False
        above_96_lst.append(above_96)
#         print(x, above_96)
    elif re.search(r'\w\s\d\d\s', str(x)):
        above_96 =  False
        above_96_lst.append(above_96)
#         print(x, above_96)
    elif re.search(r'[A-Z]\s\d\d[A-Z][A-Z] [A-Z][A-Z]', str(x)):
        above_96 =  False
        above_96_lst.append(above_96)
#         print(x, above_96)
    elif re.search(r'[0-2][A-Z][A-Z] AVE', str(x)):
        above_96 =  False
        above_96_lst.append(above_96)
#         print(x, above_96)
    else: 
        above_96 =  True
        above_96_lst.append(above_96)
#         print(x, above_96)

# above_96_lst.count(False)
df4['Above_96'] = above_96_lst


/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [320]:
# rename column so easier to reference 
df4.rename(columns={"Manhattan\xa0 96th St. & below": "Manhattan_Below_96"}, inplace=True)


/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/pandas/core/frame.py:2754: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)

In [321]:
df4[['Violation Code', 'Street Name', 'Above_96', 'Manhattan_Below_96', 'All Other Areas', 'Parking Violation']].head(20)


Out[321]:
Violation Code Street Name Above_96 Manhattan_Below_96 All Other Areas Parking Violation
0 46 W 175 ST True $115 $115 True
1 46 W 177 ST True $115 $115 True
2 46 W 163 ST True $115 $115 True
3 46 W 176 ST True $115 $115 True
5 14 W 175 ST True $115 $115 True
6 14 W 175 ST True $115 $115 True
7 24 N.PORTLAND AVE True $65 $60 True
8 24 N.PORTLAND AVE True $65 $60 True
9 24 760 BROADWAY ,BROOKL True $65 $60 True
10 24 760 BROADWAY BROOKLY True $65 $60 True
11 24 760 BROADWAY BROOKLY True $65 $60 True
12 24 760 BDWAY BKLYN True $65 $60 True
13 24 BWY True $65 $60 True
14 14 W 21 STREET False $115 $115 True
15 20 CONGRESS ST True $65 $60 True
16 14 PROSPECT PKWAY True $115 $115 True
17 20 N/S DEKALB AVE True $65 $60 True
18 14 ROGERS AVE True $115 $115 True
19 20 DECATUR ST True $65 $60 True
20 17 N/S LINDEN ST True $95 $95 True

In [322]:
# remove dollar signs
df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('$', '')

# replace long descriptions of cost with only the amount
# df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('100\n(Regular Tow, plus violation fine)', '100')
df4.ix[df4['Manhattan_Below_96'] == '100\n(Regular Tow, plus violation fine)', 'Manhattan_Below_96'] = '100'

# replace 'vary' with zero
df4['Manhattan_Below_96'] = df4['Manhattan_Below_96'].str.replace('vary', '0')

# change to int 
df4['Manhattan_Below_96'].apply(lambda x: int(x))


/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/pandas/core/indexing.py:465: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[322]:
0      115
1      115
2      115
3      115
5      115
6      115
7       65
8       65
9       65
10      65
11      65
12      65
13      65
14     115
15      65
16     115
17      65
18     115
19      65
20      95
21     115
22     115
23     115
24     115
25     115
26     115
27      65
28     115
29     115
30      65
      ... 
969     95
970    115
971     65
972     65
973     65
974     65
975    115
976     65
977     65
978     65
979     65
980     65
982    115
983     65
984    115
985     65
986     65
987    115
988    115
989    115
990    115
991     65
992     65
993    115
994    115
995    115
996     65
997     65
998     65
999     65
Name: Manhattan_Below_96, dtype: int64

In [323]:
df4['All Other Areas'] = df4['All Other Areas'].str.replace('$', '')
df4['All Other Areas'] = df4['All Other Areas'].str.replace('vary', '0')
df4.ix[df4['All Other Areas'] == '200 (Heavy Tow, plus violation fine)', 'All Other Areas'] = '200'

df4['All Other Areas'].apply(lambda x: int(x))


/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/pandas/core/indexing.py:465: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[323]:
0      115
1      115
2      115
3      115
5      115
6      115
7       60
8       60
9       60
10      60
11      60
12      60
13      60
14     115
15      60
16     115
17      60
18     115
19      60
20      95
21     115
22     115
23     115
24     115
25     115
26     115
27      60
28     115
29     115
30      60
      ... 
969     95
970    115
971     60
972     60
973     60
974     60
975    115
976     60
977     60
978     60
979     60
980     60
982    115
983     60
984    115
985     60
986     60
987    115
988    115
989    115
990    115
991     60
992     60
993    115
994    115
995    115
996     60
997     60
998     60
999     60
Name: All Other Areas, dtype: int64