In [396]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [397]:
col_types = "str"
df = pd.read_csv("mini_violations.csv", na_values={'Vehicle Year':"0", 'Date First Observed': "0", 'Vehicle Expiration Date': "0"}, low_memory=False, dtype=col_types, parse_dates=["Issue Date"])
# passing "Vehicle Expiration Date" to parse_dates as well did not convert it, so it is left out here and handled with .apply later
In [398]:
df.columns
Out[398]:
In [399]:
df.info()
In [400]:
print(df['Issue Date'][0])
In [401]:
df['Violation County'].value_counts()
Out[401]:
In [402]:
df.dtypes
Out[402]:
--> Already done in the first step: "Make the 0's show up as NaN"
As "Vehicle Expiration Date" has the same format as "Date First observed" will also convert it with the same method
In [403]:
df["Date First Observed"].head(40)
#format "Date First Observed": 20130719
type(df["Date First Observed"])
Out[403]:
In [404]:
import dateutil
from dateutil import parser
#https://dateutil.readthedocs.io/en/stable/parser.html
#dateutil.parser.parse(timestr, parserinfo=None)
In [405]:
#WHY IS IT A FLOAT? I DEFINED ALL COLUMNS TO BE STR!?
#-> the 0s were turned into NaN via na_values, and NaN is a float, so a single cell can be a float even though the column dtype was set to str
type(df["Date First Observed"][0])
Out[405]:
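A minimal illustration of why this happens (not part of the original data): missing values are stored as the float nan, so single cells in an otherwise string-typed column can still be floats.
import numpy as np
demo = pd.Series(["20130719", "0"], dtype=str).replace("0", np.nan)
print(type(demo[0]), type(demo[1]))  # <class 'str'> <class 'float'>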
In [406]:
def date_str_to_date(timestr):
    # cast to str so float NaNs do not break the parser
    convert_date_str = str(timestr)
    try:
        return dateutil.parser.parse(convert_date_str)
    except (ValueError, OverflowError):
        # return the original string if it cannot be parsed (e.g. "nan")
        return convert_date_str
In [407]:
df['date_first_observed'] = df["Date First Observed"].apply(date_str_to_date)
In [408]:
#format "Vehicle Expiration Date": 20140831
df['vehicle_expiration'] = df["Vehicle Expiration Date"].apply(date_str_to_date)
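A hedged alternative sketch (assuming both columns hold compact YYYYMMDD strings): pd.to_datetime can convert a whole column at once and coerce unparseable values to NaT.
pd.to_datetime(df["Date First Observed"], format="%Y%m%d", errors="coerce").head()
pd.to_datetime(df["Vehicle Expiration Date"], format="%Y%m%d", errors="coerce").head()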
In [409]:
import time
def changing_time(messy_time):
    # e.g. "0752A" -> hour "07", minutes "52", meridiem "A" -> "07:52AM"
    hour = messy_time[0:2]
    minutes = messy_time[-3:-1]
    meridiem = messy_time[-1:]
    d = hour + ":" + minutes + meridiem + "M"
    try:
        stripped_time = dateutil.parser.parse(d).time()
        return stripped_time
    except (ValueError, OverflowError):
        # keep the rebuilt string if it is not a valid time (e.g. missing values)
        return d
df['violation_time'] = df["Violation Time"].astype(str).apply(changing_time)
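A quick check of the helper; the sample string is an assumption about the raw "Violation Time" format (e.g. "0752A" for 7:52 AM):
changing_time("0752A")  # -> datetime.time(7, 52)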
In [410]:
type(df['violation_time'][0])
Out[410]:
In [411]:
#df["Vehicle Color"].value_counts()
df['vehicle_color'] = df["Vehicle Color"].replace(["WH", "WT", "WHT", "WHI"], "WHITE").replace(["BK", "BLK"], "BLACK").replace(["GY", "GRY", "GRAY"], "GREY").replace(["BL", "BLU"], "BLUE").replace("RD", "RED").replace(["OR","ORANG"], "ORANGE").replace(["GRN", "GR", "GN"], "GREEN").replace(["BR", "BRN", "BN", "BRWN"], "BROWN").replace(["SILVE", "SILVR", "SIL", "SL"],"SILVER").replace("GL", "GOLD").replace(["YELLO","YW"],"YELLOW").replace("TN", "TAN")
#TRIED FUNCTION AND LIST COMPREHENSION AND FOR LOOP, ALL THREW ERRORS AT SOME POINT (a working loop version is sketched after this cell)
#def change_color(wrongspell, rightspell):
# newdf = df[df["Vehicle Color"].replace(wrongspell, rightspell)]
# return newdf
#change_color("WH", "WHITE")
#change_color("WHT", "WHITE")
#change_color("WT", "WHITE")
#wrong_spellings = ["WH", "WT", "WHT"]
#for spelling in wrong_spellings:
# newdf["Vehicle Color"] = df["Vehicle Color"].replace(spelling, "WHITE")
#[df["Vehicle Color"].replace(spelling, "WHITE") for spelling in wrong_spellings]
In [412]:
df.columns
Out[412]:
In [413]:
# selecting the columns needed to answer the remaining questions
# the vehicle_color and date_first_observed columns created above are not included, as they are not needed
selection = df[['Summons Number', 'Plate ID', 'Registration State', 'Violation County', 'Issue Date', 'violation_time', 'Violation Code']]
In [414]:
# make the column names consistent (lowercase, no blank spaces within names)
selection.columns = ['summons_number', 'plate_id', 'registration_state', 'violation_county', 'issue_date', 'violation_time', 'violation_code']
In [415]:
selection
Out[415]:
In [416]:
# in atom I replaced all the $ by nothing -- seemed more time efficient than doing it in pandas ;)
vc_set = pd.read_csv("DOF_Parking_Violation_Codes.csv", dtype="str")
vc_set.columns = ["code", "definition", "manhattan_below_96", "all_other_areas"]
vc_set
Out[416]:
In [417]:
combined_df = selection.merge(vc_set, left_on='violation_code', right_on='code')
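Note that merge defaults to an inner join, so any ticket whose violation_code has no match in vc_set is dropped. If those rows should be kept, a left join would preserve them; a sketch (not what is used here):
selection.merge(vc_set, left_on='violation_code', right_on='code', how='left')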
In [418]:
combined_df.columns
Out[418]:
In [419]:
combined_df
Out[419]:
In [420]:
all_fines = combined_df["manhattan_below_96"]
cleaned_fines = []
for fine in all_fines:
    # take the leading number of each fine (e.g. "200 (Heavy Tow, plus violation fine)" -> 200);
    # values like "vary" are skipped, so the total is a lower bound ("at least")
    try:
        cleaned_fines.append(int(fine.split(" ")[0]))
    except ValueError:
        continue
money_nyc = sum(cleaned_fines)
print("NYC makes at least", money_nyc, "US-Dollar off parking violations.")
In [421]:
#most lucrative:
combined_df["manhattan_below_96"].value_counts()
Out[421]:
In [422]:
combined_df["all_other_areas"].value_counts()
Out[422]:
In [423]:
combined_df[combined_df["all_other_areas"] == "200 (Heavy Tow, plus violation fine)"]
Out[423]:
The highest fine of 200 dollars is issued outside of Manhattan and is associated with "Violation Code" 46.
In [424]:
#to find out the row index of Violation Code 46:
vc_set['code'].head(40)
#look up the definition of code 46
vc_set.loc[31]["definition"]
Out[424]:
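Looking rows up by a hard-coded index is fragile; a hedged sketch that selects by the code value instead (assuming the codes are stored as plain strings like '46'):
vc_set.loc[vc_set['code'] == '46', 'definition'].iloc[0]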
In [425]:
#most frequent:
combined_df['violation_code'].describe()
Out[425]:
Most frequent is "Violation code" 21, occurring with a frequency of 387.
In [426]:
#to find out the row index of Violation Code 21:
#vc_set['code']
#look up the definition of code 21
vc_set.loc[10]["definition"]
Out[426]:
Thoughts: 1) Group by Registration State 2) sum up the fines for all_other_areas
In [427]:
all_non_ny = combined_df[combined_df["registration_state"] != 'NY']
fines_non_ny = all_non_ny['all_other_areas']
fine_list = []
for fine in fines_non_ny:
    # strip the tow note and "vary" so that only plain numbers remain
    clean_fine = fine.replace(' (Heavy Tow, plus violation fine)', '').replace('vary', '')
    try:
        fine_list.append(int(clean_fine))
    except ValueError:
        continue
sum_fines = sum(fine_list)
print("NYC makes at least", sum_fines, "US-Dollar from vehicles that are not registered in NY")
In [428]:
def cleaning(value):
    # approximate the non-numeric fines: entries with a tow note count as 100, "vary" as 2
    if "(" in value:
        return 100
    if "vary" in value:
        return 2
    else:
        return int(value)
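A quick check of the helper; the sample strings are only assumptions about what the fine column contains:
cleaning("65")                                      # -> 65
cleaning("100 (Regular Tow, plus violation fine)")  # -> 100
cleaning("vary")                                    # -> 2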
In [429]:
combined_df['manhattan_cleaned_fines'] = combined_df['manhattan_below_96'].apply(cleaning)
In [430]:
combined_df.groupby(by='registration_state')['manhattan_cleaned_fines'].sum().sort_values().plot(kind='barh', figsize=(20,10))
Out[430]:
In [431]:
type(combined_df['violation_time'][0])
Out[431]:
In [432]:
print(combined_df['violation_time'][0])
In [433]:
combined_df['violation_time'][0].hour
Out[433]:
In [434]:
def time_to_hour(value):
    try:
        return value.hour
    except AttributeError:
        # strings that could not be parsed into times have no .hour
        pass
In [435]:
combined_df["violation_hour"] = combined_df['violation_time'].apply(time_to_hour)
combined_df.head(3)
Out[435]:
In [448]:
combined_df.groupby('violation_hour').count()['summons_number'].plot(kind="bar", figsize=(20,10))
Out[448]:
In [452]:
combined_df['violation_hour'].hist(bins=4)
Out[452]:
In [303]:
number_of_all_tickets = combined_df['manhattan_below_96'].count()
number_of_all_tickets
Out[303]:
In [304]:
# taking the total money NYC makes from question 8 -- stored in variable "money_nyc"
average_ticket_cost = money_nyc/number_of_all_tickets
print("The average ticket cost in NYC is", round(average_ticket_cost), "US-dollar.")
In [305]:
combined_df.groupby('issue_date').count()
combined_df.groupby('issue_date').count()['summons_number']
plt.style.use('ggplot')
combined_df.groupby('issue_date').count()['summons_number'].plot(kind='bar', figsize=(20,10))
Out[305]:
In [306]:
#groupby date, sum revenue
In [307]:
# .copy() so that setting the index below does not also modify combined_df
copy_df = combined_df.copy()
copy_df.index = copy_df['issue_date']
copy_df.groupby("issue_date")['manhattan_cleaned_fines'].sum().plot(kind='bar', figsize=(20,10))
Out[307]:
In [308]:
type(dates_frame['issue_date'][0])
Out[308]:
In [309]:
dates_frame
Out[309]:
In [310]:
# a value only equals itself if it is not NaN/NaT, so this keeps rows with a valid issue_date
dates_frame = combined_df[combined_df['issue_date'] == combined_df['issue_date']]
rashida = dates_frame.groupby('issue_date')
rashida.head()
Out[310]:
In [311]:
dates_frame = combined_df[combined_df['issue_date'] == combined_df['issue_date']]
by_date = dates_frame.groupby('issue_date')
type(by_date)
# iterating over a grouped column yields (group key, group) pairs, so date[0] is the date itself
[date[0] for date in by_date["issue_date"]]
# [date for date in dates_frame["issue_date"]]
Out[311]:
In [312]:
# group by date, sum fines per day
dates_frame = combined_df[combined_df['issue_date'] == combined_df['issue_date']]
by_date = dates_frame.groupby('issue_date')
by_dates_list = []
for individual_date, fines in by_date['manhattan_below_96']:
    # take the leading number of each fine string; "vary" and similar entries are skipped
    revenue_int_list = []
    for value in fines:
        try:
            revenue_int_list.append(int(value.split(" ")[0]))
        except ValueError:
            continue
    date_dict = {'date': individual_date, 'revenue': sum(revenue_int_list)}
    by_dates_list.append(date_dict)
by_dates_list
Out[312]:
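A hedged vectorized alternative to the loop above (a sketch, again assuming every usable fine starts with a leading number):
daily_revenue = (dates_frame['manhattan_below_96']
                 .str.extract(r"(\d+)", expand=False)
                 .astype(float)
                 .groupby(dates_frame['issue_date'])
                 .sum())
daily_revenue.head()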
In [313]:
by_dates_df = pd.DataFrame(by_dates_list)
by_dates_df.set_index('date')['revenue'].plot(kind='bar', figsize=(20,10))
Out[313]:
In [314]:
licensed_drivers = pd.read_csv("licensed_drivers.csv")
selection_licensed = licensed_drivers.tail(5)[["County", "Total"]]
selection_licensed
Out[314]:
In [315]:
# note: in the violation data, K stands for Kings County (Brooklyn) and R for Richmond County (Staten Island)
abbreviation = [{'state': "Staten Island", 'abbreviation': "R"}, {'state': "Brooklyn", 'abbreviation': "K"}, {'state': "Queens", 'abbreviation': "Q"}, {'state': "Bronx", 'abbreviation': "BX"}, {'state': "New York", 'abbreviation': "NY"}]
abbrev_table = pd.DataFrame(abbreviation)
abbrev_table
Out[315]:
In [316]:
joined_table = abbrev_table.merge(selection_licensed, left_on="state", right_on="County")
licensed_df = joined_table[['County', 'abbreviation', 'Total']]
licensed_df.columns = ("borough name", "abbreviation", "number of licensed drivers")
licensed_df
Out[316]:
In [317]:
#1) Get total violation cost per borough
#2) Divide it by number of licensed drivers
In [318]:
df_17 = combined_df[['summons_number', 'violation_county', 'manhattan_below_96', 'all_other_areas']]
df_17
Out[318]:
In [319]:
fine_dict_list = []
def get_fines(county, fine_column):
    viol = df_17[df_17['violation_county'] == county]
    fines = viol[fine_column]
    fines_list = []
    for fine in fines:
        # only plain numeric fines are counted; "vary" and tow entries are skipped
        try:
            fines_list.append(int(fine))
        except ValueError:
            continue
    fine_per_borough = sum(fines_list)
    dict_fines = {'state': county, 'total_fines_in_borough': fine_per_borough}
    fine_dict_list.append(dict_fines)
fines_per_borough = fine_dict_list
get_fines('NY', 'manhattan_below_96')
get_fines('K', 'all_other_areas')
get_fines('BX', 'all_other_areas')
get_fines('R', 'all_other_areas')
get_fines('Q', 'all_other_areas')
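A hedged vectorized sketch of the same idea: pick the Manhattan rate for NY and the all-other-areas rate everywhere else, coerce the non-numeric fines to NaN, and sum per county:
import numpy as np
fine_strings = np.where(df_17['violation_county'] == 'NY',
                        df_17['manhattan_below_96'], df_17['all_other_areas'])
fines_numeric = pd.to_numeric(pd.Series(fine_strings, index=df_17.index), errors='coerce')
fines_numeric.groupby(df_17['violation_county']).sum()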
In [320]:
fines_per_borough
Out[320]:
In [321]:
borough_fines = pd.DataFrame(fines_per_borough)
In [322]:
joined_df = licensed_df.merge(borough_fines, left_on="abbreviation", right_on="state")
joined_df
Out[322]:
In [323]:
joined_df['parking-ticket-cost-per-licensed-driver'] = joined_df['total_fines_in_borough'] / joined_df['number of licensed drivers']
In [324]:
joined_df
Out[324]:
Either my calculation went wrong somewhere, or the fines are collected from just a very few drivers.
In [ ]: