notebook.community

Edit and run



In [39]:

    
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt









    



ERROR! Session/line number was not unique in database. History logging moved to new session 501



In [40]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [41]:

    
import seaborn as sns



In [42]:

    
# Relative path of the moving-violations CSV that is read into `df`.
data_file = '../data/moving_violations.csv'



In [43]:

    
# Read the CSV, then normalise the two columns the analysis relies on:
# ticketissuedate is parsed into timestamps and missing totalpaid values become 0.
df = pd.read_csv(data_file)
df['ticketissuedate'] = pd.to_datetime(df['ticketissuedate'])
df['totalpaid'] = df['totalpaid'].fillna(value=0)



In [46]:

    
# Summarise the columns, their dtypes and the memory footprint of the frame.
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3993673 entries, 0 to 3993672
Data columns (total 20 columns):
accidentindicator    object
address_id           float64
agencyid             float64
fineamt              float64
location             object
objectid             int64
penalty1             float64
penalty2             float64
row_                 int64
streetsegid          float64
ticketissuedate      datetime64[ns]
tickettype           object
totalpaid            float64
violationcode        object
violationdesc        object
xcoord               float64
y                    float64
ycoord               float64
filename             object
x                   float64
dtypes: datetime64[ns](1), float64(11), int64(2), object(6)
memory usage: 609.4+ MB



In [ ]:



In [9]:

    
# Mean totalpaid for each accidentindicator value.
df.groupby('accidentindicator')['totalpaid'].mean()









    Out[9]:





accidentindicator
No     94.883731
Yes    50.499074
Name: totalpaid, dtype: float64



In [10]:

    
# Mean totalpaid for every (accidentindicator, tickettype) combination.
df.groupby(['accidentindicator', 'tickettype'])['totalpaid'].mean()









    Out[10]:





accidentindicator  tickettype    
No                 Mover Void         0.147887
                   Moving            63.071702
                   No Ticket Type    36.336931
                   Parker Void        0.000000
                   Parking            1.724138
                   Photo             97.735412
Yes                Mover Void         0.000000
                   Moving            50.913755
                   No Ticket Type    27.017391
Name: totalpaid, dtype: float64



In [11]:

    
# Replace missing ticket types with an empty string so string operations on the
# column (e.g. .str.contains) yield plain booleans instead of NaN.
df['tickettype'] = df['tickettype'].fillna(value='')



In [21]:

    
# List the column names available in the frame.
df.columns









    Out[21]:





Index([u'accidentindicator', u'address_id', u'agencyid', u'fineamt',
       u'location', u'objectid', u'penalty1', u'penalty2', u'row_',
       u'streetsegid', u'ticketissuedate', u'tickettype', u'totalpaid',
       u'violationcode', u'violationdesc', u'xcoord', u'y', u'ycoord',
       u'filename', u'x'],
      dtype='object')



In [12]:

    
# Keep only the tickets whose type mentions 'Void' (e.g. 'Mover Void', 'Parker Void').
# na=False treats a missing tickettype as "not voided", so the boolean mask stays
# valid even when the NaN fill of tickettype has not been run in this session;
# regex=False matches 'Void' as a literal substring.
voided_df = df[df.tickettype.str.contains('Void', na=False, regex=False)]
# NOTE(review): `voided` is not used anywhere in the visible notebook; presumably
# this rebinding only releases an older object of that name - confirm before removing.
voided = None



In [13]:

    
# Count the non-null entries of every column per (accidentindicator, violationcode)
# pair, then turn the group keys back into ordinary columns.
# Note: this rebinds `voided_df` from individual ticket rows to aggregated counts.
group_keys = ['accidentindicator', 'violationcode']
voided_df = voided_df.groupby(group_keys).count().reset_index()



In [14]:

    
# Tickets per group. Every non-key column of the aggregated frame holds that
# column's non-null count, so the total is taken from `objectid` (int64, hence
# never null) rather than `address_id` (float64, may contain NaN), which would
# undercount groups that include tickets without an address id.
voided_df['count'] = voided_df.objectid



In [15]:

    
# Narrow the aggregate down to the two group keys plus the ticket count.
summary_cols = ['accidentindicator', 'violationcode', 'count']
t_df = voided_df[summary_cols]



In [16]:

    
# t_df.sort_values('count', ascending=False)



In [17]:

    
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set_style("whitegrid")



In [ ]:



In [19]:

    
# Distribution of totalpaid for each tickettype, across all tickets in `df`.
sns.boxplot(data=df, x="tickettype", y="totalpaid")









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x12190ab50>



In [ ]:



In [ ]:



In [20]:

    
# Distribution of totalpaid per tickettype for voided tickets only.
# `voided_df` has been rebound to the per-group count aggregate by this point, so
# its `tickettype` and `totalpaid` columns contain counts rather than ticket types
# and amounts; plotting it would chart counts against counts. Select the voided
# ticket rows directly from `df` instead. na=False keeps the mask boolean even if
# tickettype still contains missing values.
void_mask = df.tickettype.str.contains('Void', na=False)
sns.boxplot(x="tickettype", y="totalpaid", data=df[void_mask])









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x110ab7090>



In [29]:

    
# Amount paid as a time series indexed by ticket issue date.
# Passing a Series together with `index=` makes pandas *reindex* it by label:
# the 0..N-1 labels of df.totalpaid never match the dates, so every value would
# come out as NaN. Using the underlying array pairs each payment with its date
# positionally. The result is sorted chronologically so that a cumulative sum
# over it is a running total through time rather than in file order.
ts = pd.Series(df.totalpaid.values, index=df.ticketissuedate).sort_index()



In [30]:

    
# Running total of the series, accumulated in index order.
# NOTE: `ts` is rebound to its own cumulative sum, so this cell is not idempotent -
# re-running it accumulates the already-accumulated series.
ts = ts.cumsum()



In [45]:

    
# Number of rows in the frame.
len(df)









    Out[45]:





3993673



In [ ]:



In [44]:

    
# Export a 100,000-row random sample of the frame. A fixed random_state makes the
# exported sample reproducible across kernel restarts.
# NOTE: the file is tab-separated despite its .csv extension; the name is kept
# unchanged so anything already reading this path keeps working.
df.sample(n=100000, random_state=42).to_csv('moving_violations_sample_large.csv', sep='\t', index=False)



In [ ]: