Loading Parking Violation Data and cleaning operations


In [1]:
import pandas as pd
from collections import namedtuple

In [52]:
pd.set_option('display.max_columns', 500)

In [8]:
may_2016 = "http://opendata.dc.gov/datasets/2e967e9053144a309680fccea0f7b4e1_11.csv"
file_name = 'Parking_violations_may_2016.csv'

In [ ]:

Combine all csvs into one data frame


In [9]:
df = pd.read_csv(may_2016,index_col=None, header=0)
df.columns = [col.lower() for col in df.columns]
df = df.reset_index(drop=True)

Construct datetime object from issue_time raw string


In [10]:
def mil_to_time(x):
    "Convert messy issue_time to datetime object based upon length of issue_time string"
    if x == 'nan':
        return '00:00:00.000Z'
        
    x = x.split('.')[0]
    lg = len(x)
    
    if lg == 4:
        t = x[:2] + ':' + x[2:] + ':00.000Z'
      
    elif lg == 3:
        t = '0' + x[0] + ':' + x[1:] + ':00.000Z'
    
    elif lg == 2:
        t = '0' + '0' + ':' + x + ':00.000Z'
    
    elif lg == 1:
        t = '0' + '0' + ':' + '0' + x + ':00.000Z'
    
    else:
        t = '00:00.000Z'
    
    # correction for timedate if one element is greater than 5.
    # double check this
    if int(t[3]) > 5:
        t = t[:2]+ ':' + '5' + t[4:]
        
    return t

In [11]:
df['issue_time_military'] = df.issue_time.apply(str).apply(mil_to_time)
dates = df.ticket_issue_date.str[:10] + 'T' #+ 
df['filename'] = file_name
df['ticket_issue_datetime'] = dates + df.issue_time_military

Optional Test that datetime contruction is correct


In [57]:
## Testing datetime format to ensure that everything is actually datetime
# for i, t in enumerate(df.ticket_issue_datetime):
#     try:
#         pd.to_datetime(t)
#     except:
#         print i, t

Holiday value to Boolean


In [12]:
df['holiday'] = df.holiday != 0

Delete redundant columns


In [41]:
# del df['day_of_week']
# del df['month_of_year']
# del df['week_of_year']
# del df['issue_time']
# del df['issue_time_military']
# del df['ticket_issue_date']

In [13]:
df.drop(['day_of_week', 'month_of_year', 'week_of_year', 'issue_time', 'issue_time_military', 'ticket_issue_date'   ], axis=1, inplace=True, errors='ignore')

Drop Duplicates and Fill in Empy Rows


In [14]:
df.drop_duplicates(subset='rowid_', inplace=True)
df.streetsegid.fillna(0, inplace=True)

Export to CSV


In [16]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 132850 entries, 0 to 132849
Data columns (total 16 columns):
x                       132850 non-null float64
y                        132850 non-null float64
objectid                 132850 non-null int64
rowid_                   132850 non-null int64
holiday                  132850 non-null bool
violation_code           132850 non-null object
violation_description    128535 non-null object
location                 132850 non-null object
rp_plate_state           132207 non-null object
body_style               131840 non-null object
address_id               132850 non-null int64
streetsegid              132850 non-null float64
xcoord                   132850 non-null int64
ycoord                   132850 non-null int64
filename                 132850 non-null object
ticket_issue_datetime    132850 non-null object
dtypes: bool(1), float64(3), int64(5), object(7)
memory usage: 16.3+ MB

In [ ]:


In [17]:
Parking = namedtuple('Parking', 'x, y, objectid, rowid, holiday, violation_code, \
                    violation_description, location, rp_plate_state, body_style, \
                    address_id, streetsegid, xcoord, ycoord, filename, \
                    ticket_issue_datetime')

In [22]:


In [18]:
rows = []
for index, row in df.iterrows():
    row = Parking._make(row)
    rows.append(row)

In [19]:
len(rows)


Out[19]:
132850

In [ ]:


In [ ]:


In [ ]:


In [20]:
df.to_csv('sample.csv')

In [ ]: