In [1]:
import pandas as pd
dates=['Issue Date', 'Vehicle Expiration Date'] #Importing dates as datetime
col_types={'Plate ID': 'str','Date First Observed':'str'} #Importing Plate ID and the Date First Observed as a string, because it has to be made into a time by a function.
In [2]:
df=pd.read_csv("small-violations.csv",dtype=col_types,parse_dates=dates,na_values={'Date First Observed'==0,'Vehicle Expiration Date'==88888888,88888888.0,88880088},infer_datetime_format=True)
In [3]:
df.dtypes #Finding out if our datatype import has worked or not.
Out[3]:
In [4]:
df.columns #Identifying all the columns of the dataframe.
Out[4]:
In [5]:
## WRITING A FUNCTION TO CONVERT STRING INTO A DATE
def string_to_date(string):
from dateutil import parser
if pd.isnull(string):
return None
else:
dt = parser.parse(string)
return dt.date()
In [6]:
string_to_date('20160808') #Testing it out.
Out[6]:
In [7]:
df['Date First Observed']= df['Date First Observed'].apply(string_to_date) #Applying it to the dataframe column
In [8]:
def string_to_time(string): #CONVERTING THE VIOLATION TIME STRING INTO TIME
from dateutil import parser
import re
if pd.isnull(string):
return None
if string =='Nan' or string =='nan' or string =='0':
return None
if string[0:4].isnumeric()==True:
if int(string[0:2]) <=12 and int(string[2:4])<=59:
regex=re.search(r"\d\d\d\d[AaPp]",string)
if regex:
time=string[0:2]+":"+string[2:4]+" "+string[4]+"M"
return parser.parse(time).time()
else:
return None
In [9]:
def float_to_int(float): #THE VIOLATION CODE COLUMN IS ACTUALLY A FLOAT, SO WE'RE CONVERTING INTO AN INTEGER FIRST.
if pd.isnull(float):
return None
else:
return int(float)
In [10]:
df['Violation Code']=df['Violation Code'].apply(float_to_int)
In [11]:
df['Violation Time']=df['Violation Time'].apply(string_to_time)
In [12]:
df['Violation Time'].head() #TESTING IF IT WORKED
Out[12]:
In [13]:
df['Vehicle Color'].value_counts() #LOOKING AT COLORS IN THE ORIGINAL DATASET
Out[13]:
In [14]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['WHT', 'WH','WT','WHI'], 'WHITE')
In [15]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['GRAY', 'GY','GRY'], 'GREY')
In [16]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['BLK', 'BK'], 'BLACK')
In [17]:
df['Vehicle Color'] = df['Vehicle Color'].replace('BL', 'BLUE')
In [18]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['BR', 'BN','BRWN'], 'BROWN')
In [19]:
df['Vehicle Color'] = df['Vehicle Color'].replace('RD', 'RED')
In [20]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['GR', 'GN','GRN'], 'GREEN')
In [21]:
df['Vehicle Color'] = df['Vehicle Color'].replace('TN', 'TAN')
In [22]:
df['Vehicle Color'] = df['Vehicle Color'].replace('GL', 'GOLD')
In [23]:
df['Vehicle Color'] = df['Vehicle Color'].replace('BRN', 'BROWN')
In [24]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['YW', 'YELLO','YELL'], 'YELLOW')
In [25]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['BL', 'BLU'], 'BLUE')
In [26]:
df['Vehicle Color'] = df['Vehicle Color'].replace('MR', 'MAROON')
In [27]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['SIL', 'SILVR','SILVE','SILV'], 'SILVER')
In [28]:
df['Vehicle Color'] = df['Vehicle Color'].replace(['OR', 'ORANG'], 'ORANGE')
In [29]:
df['Vehicle Color'].value_counts() #LOOKING AT THE COLORS AFTER CLEANING UP, MUCH BETTER!
Out[29]:
In [30]:
#READING PARKING VIOLATION CODES FROM NYC DATASET
codesdf=pd.read_csv("DOF_Parking_Violation_Codes.csv",dtype={'Manhattan\xa0 96th St. & below':int, 'All Other Areas': int})
In [31]:
codesdf.columns
Out[31]:
In [32]:
codesdf['Manhattan\xa0 96th St. & below'].value_counts() #CHECKING IF THE IMPORT WORKED WELL
Out[32]:
In [33]:
newdf=df.join(codesdf, on='Violation Code', how='left') #MERGING THE TWO DATAFRAMES INTO A NEW DATAFRAME Unimaginatively CALLED NEWDF
In [34]:
newdf.head() #DID IT WORK? SEEMS TO HAVE, THE NANS IN THE END OF THIS FRAME ARE THERE IN THE ORIGINAL ONE TOO.
Out[34]:
In [35]:
newdf.columns #THE NEW COLUMNS SEEM TO HAVE BEEN ADDED
Out[35]:
In [36]:
newdf['Violation County'].value_counts() #LET US SEE THE VIOLATION COUNTY DATA, THIS NEEDS TO BE CLEANED UP A BIT
Out[36]:
In [37]:
newdf['Violation County'] = newdf['Violation County'].replace('BX', 'BRONX')
In [38]:
newdf['Violation County'] = newdf['Violation County'].replace(['R', 'RICH'], 'RICHMOND')
In [39]:
newdf['Violation County'] = newdf['Violation County'].replace('K', 'KINGS')
In [40]:
newdf['Violation County'].value_counts() #ALL CLEANED
Out[40]:
Now, we know the locations of the violation and from the merged dataset we know that there the fine rates vary upon the location of the violation, with certain rates for everything under 96th street in Manhattan and other rates for elsewhere. So using the Violation County numbers, and the assumption that NY county is everything under 96th - we calculate.
In [41]:
manhattanviolations=newdf[newdf['Violation County']=='NY']
In [42]:
#FINES COLLECTED IN MANHATTAN
manhattanviolations['Manhattan\xa0 96th St. & below'].sum()
Out[42]:
In [43]:
allotherviolations=newdf[newdf['Violation County']!= 'NY']
In [44]:
#FINES NOT COLLECTED IN MANHATTAN
allotherviolations['All Other Areas'].sum()
Out[44]:
In [45]:
#FINES COLLECTED IN ALL OF NYC
manhattanviolations['Manhattan\xa0 96th St. & below'].sum()+allotherviolations['All Other Areas'].sum()
Out[45]:
In [46]:
newdf['Manhattan\xa0 96th St. & below'].value_counts()
#IT IS EVIDENT THAT THE 265 DOLLARS IS THE MOST LUCRATIVE FINE in terms of monetary amount, but 115 DOlLARS IS THE MOST COMMON
Out[46]:
In [47]:
lucrative=newdf[newdf['Manhattan\xa0 96th St. & below']==265]
In [48]:
lucrative['DEFINITION'].value_counts()
#TRACTOR TRAILER PARKING SEEMS TO MAKE MONIES
Out[48]:
In [49]:
frequent=newdf['DEFINITION'].value_counts()
#IDENTIFYING THE MOST FREQUENT SETS OF VIOLATIONS
In [50]:
frequent.head() #THE TOP FIVE SETS OF VIOLATIONS
Out[50]:
In [51]:
nonnyviolations=newdf[newdf['Registration State']!= 'NY']
In [52]:
nonnymanhattanviolations=nonnyviolations[nonnyviolations['Violation County']=='NY']
In [53]:
nonnymanhattanviolations['Manhattan\xa0 96th St. & below'].sum()
#MONEY MADE BY NON NY VEHICLES IN MANHATTAN
Out[53]:
In [54]:
nonnymanhattanviolations=nonnyviolations[nonnyviolations['Violation County']!='NY']
In [55]:
nonnymanhattanviolations['All Other Areas'].sum()
#MONEY MADE BY NON NY VEHICLES OUTSIDE MANHATTAN
Out[55]:
In [56]:
nonnymanhattanviolations['Manhattan\xa0 96th St. & below'].sum()+nonnymanhattanviolations['All Other Areas'].sum()
#TOTAL MONEY MADE BY NON NY VEHICLES
Out[56]:
In [57]:
import matplotlib.pyplot as plt
%matplotlib inline
In [58]:
plt.style.use('ggplot')
toptenviolations=nonnyviolations['Violation Code'].value_counts()
toptenviolations.head(10).plot(kind='bar')
#I USE THE VIOLATION CODE INSTEAD OF ACTUAL VIOLATION NAME BECAUSE THE NAMES OF SOME OF THEM ARE TOO LONG.
Out[58]:
We can then inspect what 21,38,14 violation codes are.
In [59]:
newdf['Violation Time'].value_counts().plot(figsize=(16, 8))
Out[59]:
From the graph above, it is very evident that most people get tickets in the 6 AM to 12 PM ie the morning slot. More than any other segments of the day.
In [60]:
newdf['Manhattan\xa0 96th St. & below'].describe()
#AVERAGE COST OF TICKETS IN MANHATTAN is 95 DOLALRS
Out[60]:
In [61]:
newdf['All Other Areas'].describe()
#AVERAGE COST OF TICKET ELSEWHERE IS 85 DOLLARS
Out[61]:
In [62]:
import numpy
a=[84.716996,94.962425]
numpy.mean(a)
#SO THE AVERAGE TICKET COST IN NEW YORK CITY IS ABOUT 90 DOLLARS
Out[62]:
In [63]:
ticketsperday=newdf['Issue Date'].value_counts()
In [64]:
ticketsperday.head(100).plot(figsize=(16, 8))
Out[64]:
Looks like, the month of August is when the Police seem to have gone on a fining spree.
<< Not too sure on how to get this done>>
In [84]:
driversdf=pd.read_csv("drivers.csv")
In [85]:
driversdf.head(6)
Out[85]:
In [91]:
#MAKING DATAFRAMES FOR EACH OF THESE BOROUGHS/COUNTIES
bronxviolations=newdf[newdf['Violation County']=='BRONX']
brooklynviolations=newdf[newdf['Violation County']=='KINGS']
manhattanviolations=newdf[newdf['Violation County']=='NY']
queensviolations=newdf[newdf['Violation County']=='Q']
statenislandviolations=newdf[newdf['Violation County']=='RICHMOND']
In [97]:
#GETTING THE TOTAL NUMBER OF DRIVERS FROM THE NEW DATA FRAME
numberofdriversinbronx=driversdf.iloc[0]['Total']
numberofdriversinbrooklyn=driversdf.iloc[1]['Total']
numberofdriversinmanhattan=driversdf.iloc[2]['Total']
numberofdriversinqueens=driversdf.iloc[3]['Total']
numberofdriversinstatenisland=driversdf.iloc[4]['Total']
In [98]:
bronxviolations['All Other Areas'].sum()/numberofdriversinbronx #AVERAGE PARKING TICKET PER DRIVER IN BRONX
Out[98]:
In [99]:
brooklynviolations['All Other Areas'].sum()/numberofdriversinbrooklyn #AVERAGE PARKING TICKET PER DRIVER IN BROOKLYN
Out[99]:
In [100]:
manhattanviolations['Manhattan\xa0 96th St. & below'].sum()/numberofdriversinmanhattan #AVERAGE PARKING TICKET PER DRIVER IN MANHATTAN
Out[100]:
In [101]:
queensviolations['All Other Areas'].sum()/numberofdriversinqueens #AVERAGE PARKING TICKET PER DRIVER IN QUEENS
Out[101]:
In [102]:
statenislandviolations['All Other Areas'].sum()/numberofdriversinstatenisland #AVERAGE PARKING TICKET PER DRIVER IN STATEN ISLAND
Out[102]: