In [1]:
import pandas as pd
1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!
In [2]:
#As the datatypes in all of the columns vary, I decided to make all the values, except for the ones I specify,
#into str. This also takes care of questions 1. -> dtype=str
2. I don't think anyone's car was built in 0AD. Discard the '0's as NaN.
In [3]:
# The syntax for this is really very nice and clear, an example na_values= {'Vehicle Year' : ['0']}
# A lot more here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.
In [4]:
#parse_dates=True,keep_date_col = True
import datetime
import datetime as dt
dt.datetime.strptime('08/04/2013', '%m/%d/%Y')
datetime.datetime(2013, 8, 4, 0, 0)
# pd.datetime was deprecated and removed (pandas 2.0+); parse with the
# datetime module imported above instead. A def is also clearer than
# assigning a lambda to a name.
def parser(date):
    """Parse an Issue Date string like '08/04/2013' (month/day/year)."""
    return dt.datetime.strptime(date, '%m/%d/%Y')
#taken from here: http://stackoverflow.com/questions/23797491/parse-dates-in-pandas
#adding this, column counts starts at 0, without index, parse_dates=[4], date_parser=parser
#nrows=100000,
In [5]:
# dtype=str keeps leading zeros in Plate ID (question 1), the na_values
# dict turns Vehicle Year 0 into NaN (question 2), and column 4 is parsed
# as a date with the %m/%d/%Y parser defined above (question 3).
df = pd.read_csv("violations.csv", na_values= {'Vehicle Year' : ['0']}, parse_dates=[4], date_parser=parser, dtype=str)
In [6]:
df.head()
Out[6]:
In [7]:
#df.info()
4. "Date first observed" is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. "20140324") into a Python date. Make the 0's show up as NaN.
In [8]:
#df['Date First Observed'].value_counts()
In [9]:
import dateutil.parser
In [10]:
def date_transfer(x):
    """Turn a 'Date First Observed' value like 20140324 into 'DD/MM/YYYY'.

    The raw column stores dates as YYYYMMDD. Zeros and anything that
    cannot be parsed come back as the string 'NaN' so missing values are
    easy to spot in the column.
    """
    x = str(x)
    if x == '0':
        return 'NaN'
    try:
        # Fixed YYYYMMDD layout -- stdlib strptime needs no third-party parser.
        date = dt.datetime.strptime(x, '%Y%m%d')
        return date.strftime('%d/%m/%Y')
    except ValueError:
        # Covers 'nan', empty strings, and malformed dates.
        return 'NaN'

print(date_transfer(20151118))
In [11]:
df['Date First Observed'].apply(date_transfer).head()
Out[11]:
5. "Violation time" is... not a time. Make it a time.
In [12]:
#adding n_values = {'Violation Time' : ['0']}
#df = pd.read_csv("violations.csv", na_values= {'Vehicle Year' : ['0'], 'Violation Time' : ['0']}, parse_dates=[4], date_parser=parser, dtype=str)
In [13]:
df['Violation Time'].head(5)
Out[13]:
In [14]:
#df['Violation Time'].value_counts()
In [15]:
import dateutil
def pmam(x):
    """Convert a raw violation time like '0752A' into 'HH:MM AM/PM'.

    The raw column stores 12-hour times as HHMM plus a single 'A'/'P'
    meridiem letter. Anything that does not fit that shape (including
    NaN) comes back as the string 'NaN'.
    """
    x = str(x)
    try:
        # '0752A' -> '0752AM' so strptime's %p can read the meridiem.
        date = dt.datetime.strptime(x + 'M', '%I%M%p')
        # %I keeps the 12-hour clock so the AM/PM marker stays meaningful;
        # the old '%H:%M %p' mixed a 24-hour hour with an AM/PM suffix.
        return date.strftime('%I:%M %p')
    except ValueError:
        # Covers 'NaN'/'nan' inputs and malformed times.
        return 'NaN'

print(pmam('NaN'))
print(pmam("0752A"))
print(pmam("1240P"))
print(pmam("0232P"))
In [16]:
df['Violation Time pmam'] = df['Violation Time'].apply(pmam)
6. There sure are a lot of colors of cars, too bad so many of them are the same. Make "BLK" and "BLACK" the same, "WT" and "WHITE" the same, and merge any other combinations that you notice.
In [17]:
df['Vehicle Color'].head(10)
Out[17]:
In [18]:
def black_and_white(car_color):
    """Normalize common black/white color abbreviations to full names.

    Uses whole-value matching instead of chained substring .replace, so
    a code like 'WHT' cannot be corrupted into 'WHITET'. Also handles
    'WT'/'BLK' from the assignment. Missing values come back as 'NaN'.
    """
    car_color = str(car_color)
    # str(float('nan')) is 'nan', not 'NaN' -- the old check never fired.
    if car_color in ('nan', 'NaN'):
        return 'NaN'
    normalized = {
        'BK': 'BLACK',
        'BLK': 'BLACK',
        'WH': 'WHITE',
        'WT': 'WHITE',
        'WHT': 'WHITE',
    }
    return normalized.get(car_color, car_color)
In [19]:
def rainbow(car_color):
    """Normalize the remaining common color abbreviations to full names.

    Whole-value matching avoids the substring bugs of chained .replace
    (e.g. 'BRN'.replace('BR', 'BROWN') -> 'BROWNN'), and 'GY' now maps
    to 'GREY' so it actually unifies with the spelling the old guard
    clause checked for. Missing values come back as 'NaN'.
    """
    car_color = str(car_color)
    # str(float('nan')) is 'nan', not 'NaN' -- the old check never fired.
    if car_color in ('nan', 'NaN'):
        return 'NaN'
    normalized = {
        'BR': 'BROWN',
        'BRN': 'BROWN',
        'RD': 'RED',
        'GY': 'GREY',
        'TN': 'TAN',
        'BLU': 'BLUE',
        'GN': 'GREEN',
    }
    return normalized.get(car_color, car_color)
In [20]:
df['Vehicle Color'].apply(black_and_white).apply(rainbow).head(5)
Out[20]:
7. Join the data with the Parking Violations Code dataset from the NYC Open Data site.
In [21]:
DOF_Parking_Violation_Codes_df = pd.read_csv("Parking_Fine_Codes_Man_Quota.csv")
DOF_Parking_Violation_Codes_df.head()
Out[21]:
In [22]:
def make_code_int(code):
    """Convert a violation code (read in as a string) to an integer."""
    numeric_code = int(code)
    return numeric_code
In [23]:
df['Violation Code int'] = df['Violation Code'].apply(make_code_int)
In [24]:
# Default inner join: rows whose violation code has no match in the
# fine-code table are dropped from df_merged here.
df_merged = df.merge(DOF_Parking_Violation_Codes_df, left_on='Violation Code int', right_on='CODE')
In [25]:
df_merged.head(1)
Out[25]:
8. How much money did NYC make off of parking violations?
In [26]:
Mininum_Amount = df_merged['All Other Areas'].sum()
In [27]:
Mininum_Amount
Out[27]:
In [28]:
print("NYC made at least: ${:,.2f} off parking fines.".format(Mininum_Amount))
9. What's the most lucrative kind of parking violation? The most frequent?
In [29]:
def def_strip(definition):
    """Trim surrounding whitespace from a violation DEFINITION string."""
    trimmed = definition.strip()
    return trimmed
In [30]:
df_merged['DEFINITION_stripped'] = df_merged['DEFINITION'].apply(def_strip)
In [31]:
#Most frequent kind of parking violation
df_merged['DEFINITION'].value_counts().head(1)
Out[31]:
In [32]:
count_by_violation_code = df_merged['CODE'].value_counts()
In [33]:
count_by_violation_code.head()
Out[33]:
In [34]:
df_merged.groupby('CODE')['All Other Areas'].sum().sort_values(ascending=False).head()
Out[34]:
In [35]:
Most_lucrative_amount = df_merged.groupby('CODE')['All Other Areas'].sum().sort_values(ascending=False).head(1)
Most_lucrative_amount
Out[35]:
In [36]:
Most_lucrative_fine_in_words = df_merged[df_merged['CODE'] == 14]
In [37]:
Most_lucrative_fine_in_words = Most_lucrative_fine_in_words['DEFINITION'].head(1)
Most_lucrative_fine_in_words
Out[37]:
10. New Jersey has bad drivers, but does it have bad parkers, too? How much money does NYC make off of all non-New York vehicles?
In [38]:
df_merged['Registration State'].value_counts().head(10)
Out[38]:
In [39]:
Fines_for_New_York_Cars = df_merged.groupby('Registration State')['All Other Areas'].sum().sort_values(ascending=False).head(1)
In [40]:
Fines_for_New_York_Cars
Out[40]:
In [41]:
Mininum_Amount #Total fines
Out[41]:
In [66]:
# NOTE(review): 221829545.0 is presumably the NY-registered total from
# Fines_for_New_York_Cars above -- hardcoding it means this cell silently
# goes stale if the data changes; confirm and compute it from the frame.
print("New York City makes ${:,.2f} off non-New-York cars.".format(Mininum_Amount - 221829545.0))
In [43]:
#Double Check:
df_merged[df_merged['Registration State'] != 'NY'].sum()
Out[43]:
11. Make a chart of the top few.
In [44]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
In [45]:
df_merged['Registration State'].value_counts().sort_values(ascending=True).tail(10).plot(kind='barh')
Out[45]:
12. What time of day do people usually get their tickets? You can break the day up into several blocks - for example 12am-6am, 6am-12pm, 12pm-6pm,6pm-12am.
In [46]:
import dateutil
def Time_of_day(x):
    """Bucket a raw 'Violation Time' like '0752A' into a quarter-of-day label.

    The raw values are HHMM on a 12-hour clock followed by an 'A'/'P'
    meridiem letter. Returns one of '12am - 6am', '6am - 12pm',
    '12pm - 6pm', '6pm - 12am', or the string 'NaN' when the value
    cannot be read as a time.
    """
    x = str(x)
    if len(x) < 4:
        # Too short to hold HHMM (also catches NaN -> 'nan' of length 3).
        return 'NaN'
    try:
        hour = int(x[:2])
    except ValueError:
        return 'NaN'
    meridiem = x[4:5].upper()
    # Convert the 12-hour value to 24-hour before bucketing; the old code
    # ignored the A/P letter, so every PM time was mis-bucketed.
    if meridiem == 'P' and hour != 12:
        hour += 12
    elif meridiem == 'A' and hour == 12:
        hour = 0
    if not 0 <= hour <= 23:
        return 'NaN'
    # Half-open buckets: hour 6 belongs to '6am - 12pm', 12 to '12pm - 6pm',
    # 18 to '6pm - 12am' (the old <= tests put each boundary hour one block early).
    if hour < 6:
        return '12am - 6am'
    elif hour < 12:
        return '6am - 12pm'
    elif hour < 18:
        return '12pm - 6pm'
    else:
        return '6pm - 12am'
In [47]:
df_merged['Time of Day'] = df['Violation Time'].apply(Time_of_day)
In [48]:
df_merged['Time of Day'].value_counts()
Out[48]:
14. Make a graph of the number of tickets per day.
In [49]:
df_merged['Issue Date'].head()
Out[49]:
In [50]:
#Filtering out obviously wrong dates: anything in the future or before 2013 (the strict bounds also exclude Jan 1 and Dec 31, 2013).
df_merged_only_2013_dates = df_merged[(df_merged['Issue Date'] > '2013-01-01') & (df_merged['Issue Date'] < '2013-12-31')]
In [51]:
#More on histograms
#http://stackoverflow.com/questions/27365467/python-pandas-plot-histogram-of-dates
#http://stackoverflow.com/questions/29672375/histogram-in-matplotlib-time-on-x-axis
#plt.axis('off')
import matplotlib.pyplot as plt
plt.plot(range(5))
plt.gca().xaxis.set_major_locator(plt.NullLocator())
df_merged_only_2013_dates.groupby('Issue Date')['Issue Date'].count().plot(kind="bar", figsize=(15, 6), legend=None, title='Amount of daily parking tickets in 2013')
plt.show()
15. Make a graph of the amount of revenue collected per day.
In [52]:
df_merged['Issue Date'].head()
Out[52]:
In [53]:
plt.plot(range(5))
plt.gca().xaxis.set_major_locator(plt.NullLocator())
df_merged_only_2013_dates.groupby('Issue Date')['All Other Areas'].sum().plot(kind="bar", figsize=(15, 6), legend=None, title='Total daily fine amount in 2013')
plt.show()
16. Manually construct a dataframe out of https://dmv.ny.gov/statistic/2015licinforce-web.pdf (only NYC boroughts - bronx, queens, manhattan, staten island, brooklyn), having columns for borough name, abbreviation, and number of licensed drivers.
In [54]:
df_boroughs = pd.read_csv("NYC_Boroughs3.csv")
In [55]:
df_boroughs.head()
Out[55]:
17. What's the parking-ticket-$-per-licensed-driver in each borough of NYC? Do this with pandas and the dataframe you just made, not with your head!
In [56]:
def county_harm(county):
    """Harmonize 'Violation County' spellings to one abbreviation per borough.

    Returns None for missing values so the subsequent groupby drops them.
    Whole-value mapping replaces the chained substring .replace, which
    could corrupt longer codes (e.g. 'QUEENS' -> 'QS').
    """
    county = str(county)
    # str(float('nan')) is 'nan', not 'NaN' -- the old check never matched,
    # so NaNs leaked through as the literal string 'nan'.
    if county in ('nan', 'NaN'):
        return None
    harmonized = {
        'BRONX': 'BX',
        'KINGS': 'K',
        'NYC': 'NY',
        'QUEEN': 'Q',
        'QUEENS': 'Q',
        'QNS': 'Q',
        'RC': 'R',
        'RICH': 'R',
    }
    return harmonized.get(county, county)
In [57]:
df_merged['Violation County harm'] = df_merged['Violation County'].apply(county_harm)
In [58]:
summed_fine_amounts_by_county = df_merged.groupby('Violation County harm')['All Other Areas'].sum()
In [59]:
#creating dataframe
fine_amounts = pd.DataFrame(summed_fine_amounts_by_county)
In [60]:
#changing the index into column
fine_amounts.reset_index(inplace=True)
In [61]:
fine_amounts
Out[61]:
In [62]:
#Joining the two new datasets
df_ticket_dollars_per_licence = fine_amounts.merge(df_boroughs, left_on='Violation County harm', right_on='abbreviation')
In [63]:
df_ticket_dollars_per_licence
Out[63]:
In [64]:
df_ticket_dollars_per_licence['Ticket Dollars per Licence'] = df_ticket_dollars_per_licence['All Other Areas'] / df_ticket_dollars_per_licence['Total']
In [65]:
#Dividing one dataset by the other
df_ticket_dollars_per_licence[['borough name', 'abbreviation', 'Ticket Dollars per Licence']]
Out[65]: