In [3]:
import pandas as pd
In [4]:
!head -n 1000 violations.csv > small-violations.csv
In [5]:
plate_info = {'Plate ID': 'str'}
df = pd.read_csv("small-violations.csv", dtype=plate_info)
In [105]:
df
Out[105]:
In [6]:
df.head()
Out[6]:
In [107]:
df.head(10)
Out[107]:
In [106]:
df.tail()
Out[106]:
In [7]:
plate_info = {'Plate ID': 'str'}
df = pd.read_csv("small-violations.csv", dtype=plate_info, na_values={'Vehicle Year': '0', 'Date First Observed': '0'})
df.head()
Out[7]:
In [19]:
import dateutil
def date_to_date(date):
date = str(date)
parsed_date = dateutil.parser.parse(date)
return parsed_date
In [20]:
df.columns
Out[20]:
In [21]:
df['New Issue Date']= df['Issue Date'].apply(date_to_date)
In [22]:
import datetime
def convert_to_time(time):
try:
str_time = str(time)
return datetime.datetime.strptime(str_time, "%Y%m%d")
except:
return None
In [23]:
other_df = df[df['Vehicle Expiration Date'] != 0]
other_df.head()
Out[23]:
In [24]:
other_df['New Vehicle Expiration Date']= other_df['Vehicle Expiration Date'].apply(convert_to_time)
In [26]:
other_df.head()
Out[26]:
In [27]:
other_df.columns
Out[27]:
In [28]:
other_df['Date First Observed'].dtypes
Out[28]:
In [29]:
other_df['Date First Observed'].tail()
Out[29]:
In [30]:
import dateutil
In [31]:
other_df['Date First Observed']
Out[31]:
In [32]:
other_df['Violation Time'].head()
Out[32]:
In [33]:
other_df['Violation Time'].tail()
Out[33]:
In [35]:
def int_to_date(integer):
if not pd.isnull(integer):
date = str(int(integer))
parsed_date = dateutil.parser.parse(date)
return parsed_date.strftime("%Y-%-m-%d")
In [36]:
other_df['Date First Observed'].apply(int_to_date)
Out[36]:
In [37]:
def violation_time_to_time(time):
try:
hour = time[0:2]
minutes = time[2:4]
am_pm= time[4]
regular_time= hour + ":" + minutes + " " + am_pm + 'm'
violation_time_fixed = dateutil.parser.parse(regular_time)
return violation_time_fixed.strftime("%H:%M%p")
except:
return None
In [38]:
other_df['Violation Time'].apply(violation_time_to_time)
Out[38]:
In [39]:
other_df['Vehicle Color'].value_counts()
Out[39]:
In [41]:
def color_rename(color):
if (color == 'BLACK') or (color == 'BLK') or (color == 'BK'):
return 'BLACK'
elif (color == 'WHITE') or (color == 'WHT') or (color == 'WH') or (color == 'W'):
return 'WHITE'
other_df['Vehicle Color'].apply(color_rename)
Out[41]:
In [42]:
parking_violations_df = pd.read_csv("DOF_Parking_Violation_Codes.csv", encoding="mac_roman", error_bad_lines=False)
parking_violations_df.head()
Out[42]:
In [43]:
parking_violations_df['CODE'].describe()
Out[43]:
In [44]:
other_df['Violation Code'].describe()
Out[44]:
In [57]:
def convert_to_str(n):
return str(n)
In [58]:
parking_violations_df['Code'] = parking_violations_df['CODE'].apply(convert_to_str)
In [59]:
other_df['Violation code'] = other_df['Violation Code'].apply(convert_to_str)
In [60]:
parking_violations_df.head()
Out[60]:
In [61]:
updated_parking_violations_df = parking_violations_df.rename(columns={'Manhattan  96th St. & below': 'Manhattan 96th & below', 'All Other Areas': 'All other areas'})
updated_parking_violations_df.head()
Out[61]:
In [62]:
other_df.head()
Out[62]:
In [63]:
diff_violations_df = pd.merge(other_df, updated_parking_violations_df, left_on='Violation code', right_on='Code')
diff_violations_df.head()
Out[63]:
In [64]:
diff_violations_df['Manhattan 96th & below'].describe()
Out[64]:
In [65]:
diff_violations_df['All other areas'].describe()
Out[65]:
In [66]:
diff_violations_df['Manhattan 96th & below'].apply(convert_to_str).head()
Out[66]:
In [68]:
diff_violations_df['All other areas'].apply(convert_to_str).head()
Out[68]:
In [69]:
diff_violations_df = new_violations_df[new_violations_df['Manhattan 96th & below'] != 'vary']
diff_violations_df.head()
Out[69]:
In [70]:
import re
def strip_and_convert_to_int(string):
match = re.findall(r"^\$?\d*", string)
if match:
new_string = string.replace("$", "").split()
new_int = int(new_string[0])
return new_int
else:
return None
In [71]:
diff_violations_df['Manhattan 96th and below'] = diff_violations_df['Manhattan 96th & below'].apply(strip_and_convert_to_int)
In [72]:
diff_violations_df.head()
Out[72]:
In [73]:
diff_violations_df['All Other Areas'] = diff_violations_df['All other areas'].apply(strip_and_convert_to_int)
diff_violations_df.tail()
Out[73]:
In [74]:
diff_violations_df['All Other Areas'].value_counts().head()
Out[74]:
In [75]:
manhattan_violations = diff_violations_df.groupby('Violation code')['All Other Areas'].sum()
manhattan_violations.sum()
Out[75]:
In [76]:
violations_not_man = diff_violations_df.groupby('Violation code')['Manhattan 96th and below'].sum()
violations_not_man.sum()
Out[76]:
In [77]:
violations_revenue = violations_not_man.sum() + manhattan_violations.sum()
In [78]:
violations_revenue
Out[78]:
In [79]:
manhattan_violations.sort_values(ascending=False)
Out[79]:
In [80]:
violations_not_man.sort_values(ascending=False)
Out[80]:
In [81]:
new_violations_df['Violation code'].value_counts()
Out[81]:
In [82]:
out_of_staters_df = diff_violations_df[diff_violations_df['Registration State'] != 'NY']
out_of_staters_df.head()
Out[82]:
In [83]:
out_of_staters_other = out_of_staters_df.groupby('Violation code')['All Other Areas'].sum()
out_of_staters_other.sum()
Out[83]:
In [84]:
out_of_staters_manhattan= out_of_staters_df.groupby('Violation code')['Manhattan 96th and below'].sum()
out_of_staters_manhattan.sum()
Out[84]:
In [85]:
total_out_of_staters_violations = out_of_staters_other.sum()+ out_of_staters_manhattan.sum()
total_out_of_staters_violations
Out[85]:
In [87]:
%matplotlib inline
In [88]:
out_of_staters_other.sort_values(ascending=False).plot(kind='bar', x='Violation code')
Out[88]:
In [89]:
out_of_staters_manhattan.sort_values(ascending=False).plot(kind='bar', x='Violation code')
Out[89]:
In [ ]:
In [91]:
average_tix_price = total_out_of_staters_violations / diff_violations_df['Violation code'].value_counts().sum()
average_tix_price
Out[91]:
In [92]:
diff_violations_df['Issue Date'].value_counts().head(10).plot(kind='barh')
Out[92]:
In [93]:
daily_revenue = total_out_of_staters_violations / new_violations_df['New Issue Date'].value_counts()
daily_revenue.sort_values(ascending=False).head(20).plot(kind='bar')
Out[93]:
In [95]:
nyc_licenses = pd.read_excel("NYC.xlsx")
nyc_licenses
In [98]:
diff_violations_df.columns
Out[98]:
In [99]:
diff_violations_df['Violation County'].value_counts()
Out[99]:
In [100]:
bronx_violations = diff_violations_df[diff_violations_df['Violation County'] == 'BX']
bronx_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'BX']
bronx_tix = bronx_violations.groupby('Violation code')['All Other Areas'].sum()
driver_bronx_tix = bronx_licenses / bronx_tix.sum()
driver_bronx_tix
In [101]:
queens_violations = diff_violations_df[diff_violations_df['Violation County'] == 'Q']
queens_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'Q']
queens_tix = queens_violations.groupby('Violation code')['All Other Areas'].sum()
driver_queens_tix = queens_licenses / queens_tix.sum()
driver_queens_tix
In [102]:
ny_violations = diff_violations_df[diff_violations_df['Violation County'] == 'NY']
ny_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'NY']
ny_tix = ny_violations.groupby('Violation code')['All Other Areas'].sum()
driver_ny_tix = ny_licenses / ny_tix.sum()
driver_ny_tix
In [103]:
brooklyn_violations = diff_violations_df[diff_violations_df['Violation County'] == 'R']
brooklyn_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'R']
brooklyn_tix = brooklyn_violations.groupby('Violation code')['All Other Areas'].sum()
driver_brooklyn_tix = brooklyn_licenses / brooklyn_tix.sum()
driver_brooklyn_tix
In [104]:
staten_is_violations = diff_violations_df[diff_violations_df['Violation County'] == 'K']
staten_is_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'K']
staten_is_tix = violations_kings.groupby('Violation code')['All Other Areas'].sum()
driver_staten_is_tix = staten_is_licenses / staten_is_tix.sum()
driver_staten_is_tix
In [ ]: