In [1]:
import pandas as pd
1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!
In [2]:
#As the datatypes in all of the columns vary, I decided to to make all the values, except for the ones I specify
#into str. This also takes care of questions 1. -> dtype=str
2. I don't think anyone's car was built in 0AD. Discard the '0's as NaN.
In [3]:
# The syntax for this is really very nice and clear, an example na_values= {'Vehicle Year' : ['0']}
# A lot more here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.
In [4]:
#parse_dates=True,keep_date_col = True
import datetime
import datetime as dt
dt.datetime.strptime('08/04/2013', '%m/%d/%Y')
datetime.datetime(2013, 8, 4, 0, 0)
parser = lambda date: pd.datetime.strptime(date, '%m/%d/%Y')
#taken from here: http://stackoverflow.com/questions/23797491/parse-dates-in-pandas
#adding this, column counts starts at 0, without index, parse_dates=[4], date_parser=parser
#nrows=100000,
In [5]:
df = pd.read_csv("violations.csv", na_values= {'Vehicle Year' : ['0']}, parse_dates=[4], date_parser=parser, dtype=str)
In [6]:
df.head()
Out[6]:
In [7]:
#df.info()
4. "Date first observed" is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. "20140324") into a Python date. Make the 0's show up as NaN.
In [8]:
#df['Date First Observed'].value_counts()
In [9]:
import dateutil.parser
In [10]:
def date_transfer(x):
x = str(x)
if x == '0':
return 'NaN'
try:
date = dateutil.parser.parse(x)
return str(date.strftime('%d/%m/%Y'))
except:
return 'NaN'
# dt.datetime.strptime('20130719', '%Y%m%d')
# datetime.datetime(2013, 7, 19, 0, 0)
# year_month_tag = dateutil.parser.parse(x)
return x
print(date_transfer(20151198))
In [11]:
df['Date First Observed'].apply(date_transfer).head()
Out[11]:
5. "Violation time" is... not a time. Make it a time.
In [12]:
#adding n_values = {'Violation Time' : ['0']}
#df = pd.read_csv("violations.csv", na_values= {'Vehicle Year' : ['0'], 'Violation Time' : ['0']}, parse_dates=[4], date_parser=parser, dtype=str)
In [13]:
df['Violation Time'].head(5)
Out[13]:
In [14]:
#df['Violation Time'].value_counts()
In [15]:
import dateutil
def pmam(x):
x = str(x)
#x = (':'.join(a+b for a,b in zip(x[::2], x[1::2])))
if x == 'NaN':
pass
try:
x = str(x[:2] + ':' + x[2:])
date = dateutil.parser.parse(x)
return str(date.strftime('%H:%M %p'))
except:
return 'NaN'
print(pmam('NaN'))
print(pmam("0752A"))
print(pmam("1240P"))
print(pmam("0232P"))
In [16]:
df['Violation Time pmam'] = df['Violation Time'].apply(pmam)
6. There sure are a lot of colors of cars, too bad so many of them are the same. Make "BLK" and "BLACK", "WT" and "WHITE", and any other combinations that you notice.
In [17]:
df['Vehicle Color'].head(10)
Out[17]:
In [18]:
def black_and_white(car_color):
car_color = str(car_color)
if car_color == 'NaN':
return 'NaN'
elif car_color == 'WHITE':
return car_color
elif car_color == 'BLACK':
return car_color
else:
return car_color.replace("BK","BLACK").replace("WH", "WHITE")
In [19]:
def rainbow(car_color):
car_color = str(car_color)
if car_color == 'NaN':
return car_color
elif car_color == 'BROWN':
return car_color
elif car_color == 'BLUE':
return car_color
elif car_color == 'RED':
return car_color
elif car_color == 'GREY':
return car_color
elif car_color == 'GREEN':
return car_color
else:
return car_color.replace("BR","BROWN").replace("RD", "RED").replace("GY", "GRAY").replace("TN", "TAN").replace("BLU", "BLUE").replace("GN", "GREEN")
In [20]:
df['Vehicle Color'].apply(black_and_white).apply(rainbow).head(5)
Out[20]:
7.Join the data with the Parking Violations Code dataset from the NYC Open Data site.
In [21]:
DOF_Parking_Violation_Codes_df = pd.read_csv("DOF_Parking_Violation_Codes.csv")
DOF_Parking_Violation_Codes_df.head()
Out[21]:
In [22]:
df_merged = df.merge(DOF_Parking_Violation_Codes_df, left_on='Violation Code', right_on='CODE')
8. How much money did NYC make off of parking violations?
In [23]:
def money_to_int(money_str):
if money_str == '200 (Heavy Tow plus violation fine)':
money_str_Heavy_Tow = money_str[:3]
return int(money_str_Heavy_Tow)
if money_str == '100\n(Regular Tow, plus violation fine)':
money_str_Heavy_Tow = money_str[:3]
return int(money_str_Heavy_Tow)
try:
return int(money_str.replace("$","").replace(",",""))
except:
return None
In [24]:
df_merged['All Other Areas int'] = df_merged['All Other Areas'].apply(money_to_int)
In [25]:
df_merged['Manhattan\xa0 96th St. & below int'] = df_merged['Manhattan\xa0 96th St. & below'].apply(money_to_int)
In [26]:
Mininum_Amount = df_merged['All Other Areas int'].sum()
In [27]:
print("NYC made at least: ${:,.2f} on parking fines.".format(Mininum_Amount))
9. What's the most lucrative kind of parking violation? The most frequent?
In [28]:
def def_strip(definition):
return definition.strip()
In [29]:
df_merged['DEFINITION_stripped'] = df_merged['DEFINITION'].apply(def_strip)
In [30]:
#Most frequent kind of parking violation
df_merged['DEFINITION'].value_counts().head(1)
Out[30]:
In [31]:
count_by_violation_code = df_merged['CODE'].value_counts()
In [32]:
count_by_violation_code.head()
Out[32]:
In [33]:
df_merged.groupby('CODE')['All Other Areas int'].sum().sort_values(ascending=False).head()
Out[33]:
In [34]:
Most_lucrative_amount = df_merged.groupby('CODE')['All Other Areas int'].sum().sort_values(ascending=False)[1]
In [35]:
Most_lucrative_fine_in_words = df_merged[df_merged['CODE'] == '14']
In [36]:
Most_lucrative_fine_in_words = Most_lucrative_fine_in_words['DEFINITION'].head(1)
In [37]:
print("The most lucrative fine regarded ${:,.2f} on parking fines.".format(Most_lucrative_amount))
print("It regarded " + Most_lucrative_fine_in_words)
10. New Jersey has bad drivers, but does it have bad parkers, too? How much money does NYC make off of all non-New York vehicles?
In [38]:
df_merged['Registration State'].value_counts().head(10)
Out[38]:
In [39]:
Fines_for_New_York_Cars = df_merged.groupby('Registration State')['All Other Areas int'].sum().sort_values(ascending=False).head(1)
In [40]:
Fines_for_New_York_Cars
Out[40]:
In [41]:
Mininum_Amount #Total fines
Out[41]:
In [42]:
print("New York City makes ${:,.2f} off non-New-York cars.".format(Mininum_Amount - 220943620.0))
In [43]:
#Double Check:
df_merged[df_merged['Registration State'] != 'NY'].sum()
Out[43]:
11. Make a chart of the top few.
In [44]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
In [45]:
df_merged['Registration State'].value_counts().sort_values(ascending=True).tail(10).plot(kind='barh')
Out[45]:
12. What time of day do people usually get their tickets? You can break the day up into several blocks - for example 12am-6am, 6am-12pm, 12pm-6pm,6pm-12am.
In [46]:
import dateutil
def Time_of_day(x):
x = str(x)
#x = (':'.join(a+b for a,b in zip(x[::2], x[1::2])))
if x == 'NaN':
pass
try:
x = str(x[:2] + ':' + x[2:])
date = dateutil.parser.parse(x)
time = int(str(date.strftime('%H')))
if time <= 6:
return '12am - 6am'
elif time <= 12:
return '6am - 12pm'
elif time <= 18:
return '12pm - 6pm'
elif time <= 24:
return '6pm - 12am'
except:
return 'NaN'
In [47]:
df_merged['Time of Day'] = df['Violation Time'].apply(Time_of_day)
In [48]:
df_merged['Time of Day'].value_counts()
Out[48]:
14. Make a graph of the number of tickets per day.
In [49]:
df_merged['Issue Date'].head()
Out[49]:
In [50]:
#Filtering out all the dates that were in the future and before 2013, obviously wrong dates.
df_merged_only_2013_dates = df_merged[(df_merged['Issue Date'] > '2013-01-01') & (df_merged['Issue Date'] < '2013-12-31')]
In [51]:
#More on histograms
#http://stackoverflow.com/questions/27365467/python-pandas-plot-histogram-of-dates
#http://stackoverflow.com/questions/29672375/histogram-in-matplotlib-time-on-x-axis
df_merged_only_2013_dates.groupby('Issue Date')['Issue Date'].count().plot(kind="bar", figsize=(15, 6))
Out[51]:
15. Make a graph of the amount of revenue collected per day.
In [52]:
df_merged['Issue Date'].head()
Out[52]:
In [53]:
df_merged_only_2013_dates.groupby('Issue Date')['All Other Areas int'].sum().plot(kind="bar", figsize=(15, 6))
Out[53]:
16. Manually construct a dataframe out of https://dmv.ny.gov/statistic/2015licinforce-web.pdf (only NYC boroughts - bronx, queens, manhattan, staten island, brooklyn), having columns for borough name, abbreviation, and number of licensed drivers.
In [54]:
df_boroughs = pd.read_csv("NYC_Boroughs.csv")
In [55]:
df_boroughs.head()
Out[55]:
17. What's the parking-ticket-$-per-licensed-driver in each borough of NYC? Do this with pandas and the dataframe you just made, not with your head!
In [56]:
df_merged['Violation County'].value_counts()
Out[56]:
In [57]:
df_merged.groupby('Violation County')['All Other Areas int'].sum()
Out[57]:
In [58]:
def boroughs_trans(borough):
borough = str(borough)
if borough == 'NaN':
pass
elif borough == 'RICH':
return 'R'
elif borough == 'BRONX':
return 'BX'
else:
return None