In [39]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
In [40]:
%matplotlib inline
In [41]:
import seaborn as sns
In [42]:
# Path to the moving-violations CSV that is read into `df` below.
data_file = "../data/moving_violations.csv"
In [43]:
# Read the CSV and normalise two columns in a single chain:
#   * ticketissuedate is converted to datetime values
#   * missing totalpaid entries are replaced with 0
df = pd.read_csv(data_file).assign(
    ticketissuedate=lambda frame: pd.to_datetime(frame['ticketissuedate']),
    totalpaid=lambda frame: frame['totalpaid'].fillna(0),
)
In [46]:
# Print pandas' summary of df (DataFrame.info) to inspect the loaded frame.
df.info()
In [ ]:
In [9]:
# Average totalpaid for each accidentindicator value.
df.groupby('accidentindicator')['totalpaid'].mean()
Out[9]:
In [10]:
# Average totalpaid for each (accidentindicator, tickettype) combination.
# NOTE(review): groupby omits rows whose key is NaN by default, so if tickettype
# still holds missing values when this cell runs, those tickets are left out.
df.groupby(['accidentindicator', 'tickettype'])['totalpaid'].mean()
Out[10]:
In [11]:
# Replace missing ticket types with an empty string.
df['tickettype'] = df['tickettype'].fillna('')
In [21]:
# Display the column labels of df.
df.columns
Out[21]:
In [12]:
# Rows of df whose tickettype contains the text 'Void'.
# na=False makes a missing tickettype count as "no match"; without it the mask
# would contain NaN for those rows and boolean indexing would raise.
voided_df = df[df['tickettype'].str.contains('Void', na=False)]
# NOTE(review): `voided` is not read by any visible cell — confirm it is still needed.
voided = None
In [13]:
# Number of voided tickets per (accidentindicator, violationcode) pair, as a
# three-column frame: accidentindicator, violationcode, count.
#
# The table is built straight into t_df instead of overwriting voided_df, so
# voided_df keeps holding the per-ticket rows (the boxplot further down reads
# tickettype/totalpaid from it) and this cell can be re-run without changing
# its own input.
# size() counts the rows in each group, so the result does not depend on any
# particular column (such as address_id) being free of missing values.
t_df = (
    voided_df
    .groupby(['accidentindicator', 'violationcode'])
    .size()
    .reset_index(name='count')
)
In [16]:
# t_df.sort_values('count', ascending=False)
In [17]:
# Select seaborn's "whitegrid" style for the plots drawn below.
sns.set_style("whitegrid")
In [ ]:
In [19]:
# Box plot of totalpaid for each tickettype, over every row of df.
sns.boxplot(data=df, x="tickettype", y="totalpaid")
Out[19]:
In [ ]:
In [ ]:
In [20]:
# Box plot of totalpaid for each tickettype, restricted to voided_df.
# NOTE(review): this reads tickettype/totalpaid as per-ticket values, so
# voided_df must be the filtered ticket rows here, not an aggregated table —
# verify against the cells that assign voided_df.
sns.boxplot(data=voided_df, x="tickettype", y="totalpaid")
Out[20]:
In [29]:
# Amount paid per ticket, labelled by the ticket's issue date.
# The raw values are passed rather than the totalpaid Series itself: when the
# data argument is a Series, pd.Series(data, index=...) re-aligns it to the new
# index, and df's row labels (the default index from read_csv) never equal the
# dates, so every entry would come out as NaN.
ts = pd.Series(
    df['totalpaid'].values,
    index=df['ticketissuedate'],
    name='totalpaid',
)
In [30]:
# Replace ts with its running total. The cell overwrites its own input, so
# executing it a second time accumulates the already-accumulated values.
# NOTE(review): the sum follows the current row order of ts; confirm the rows
# are in date order if a cumulative-over-time curve is intended.
ts = ts.cumsum()
In [45]:
# Number of rows in df.
len(df)
Out[45]:
In [ ]:
In [44]:
# Write a random subset of df to disk as a tab-separated file without the index.
# NOTE(review): the file name ends in .csv although the separator is a tab —
# kept unchanged in case something else reads this path.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42  # fixed seed so re-running the notebook exports the same rows

# sample() raises when asked for more rows than the frame holds, so cap the
# request at the number of rows available.
df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED).to_csv(
    'moving_violations_sample_large.csv', sep='\t', index=False
)
In [ ]: