Loading Parking Violation Data and cleaning operations



In [1]:

    
import pandas as pd
from collections import namedtuple



In [52]:

    
pd.set_option('display.max_columns', 500)



In [8]:

    
may_2016 = "http://opendata.dc.gov/datasets/2e967e9053144a309680fccea0f7b4e1_11.csv"
file_name = 'Parking_violations_may_2016.csv'



In [ ]:

Combine all csvs into one data frame



In [9]:

    
df = pd.read_csv(may_2016,index_col=None, header=0)
df.columns = [col.lower() for col in df.columns]
df = df.reset_index(drop=True)

Construct datetime object from issue_time raw string



In [10]:

    
def mil_to_time(x):
    "Convert messy issue_time to datetime object based upon length of issue_time string"
    if x == 'nan':
        return '00:00:00.000Z'
        
    x = x.split('.')[0]
    lg = len(x)
    
    if lg == 4:
        t = x[:2] + ':' + x[2:] + ':00.000Z'
      
    elif lg == 3:
        t = '0' + x[0] + ':' + x[1:] + ':00.000Z'
    
    elif lg == 2:
        t = '0' + '0' + ':' + x + ':00.000Z'
    
    elif lg == 1:
        t = '0' + '0' + ':' + '0' + x + ':00.000Z'
    
    else:
        t = '00:00.000Z'
    
    # correction for timedate if one element is greater than 5.
    # double check this
    if int(t[3]) > 5:
        t = t[:2]+ ':' + '5' + t[4:]
        
    return t



In [11]:

    
df['issue_time_military'] = df.issue_time.apply(str).apply(mil_to_time)
dates = df.ticket_issue_date.str[:10] + 'T' #+ 
df['filename'] = file_name
df['ticket_issue_datetime'] = dates + df.issue_time_military

Optional Test that datetime contruction is correct



In [57]:

    
## Testing datetime format to ensure that everything is actually datetime
# for i, t in enumerate(df.ticket_issue_datetime):
#     try:
#         pd.to_datetime(t)
#     except:
#         print i, t

Holiday value to Boolean



In [12]:

    
df['holiday'] = df.holiday != 0

Delete redundant columns



In [41]:

    
# del df['day_of_week']
# del df['month_of_year']
# del df['week_of_year']
# del df['issue_time']
# del df['issue_time_military']
# del df['ticket_issue_date']



In [13]:

    
df.drop(['day_of_week', 'month_of_year', 'week_of_year', 'issue_time', 'issue_time_military', 'ticket_issue_date'   ], axis=1, inplace=True, errors='ignore')

Drop Duplicates and Fill in Empy Rows



In [14]:

    
df.drop_duplicates(subset='rowid_', inplace=True)
df.streetsegid.fillna(0, inplace=True)

Export to CSV



In [16]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 132850 entries, 0 to 132849
Data columns (total 16 columns):
x                       132850 non-null float64
y                        132850 non-null float64
objectid                 132850 non-null int64
rowid_                   132850 non-null int64
holiday                  132850 non-null bool
violation_code           132850 non-null object
violation_description    128535 non-null object
location                 132850 non-null object
rp_plate_state           132207 non-null object
body_style               131840 non-null object
address_id               132850 non-null int64
streetsegid              132850 non-null float64
xcoord                   132850 non-null int64
ycoord                   132850 non-null int64
filename                 132850 non-null object
ticket_issue_datetime    132850 non-null object
dtypes: bool(1), float64(3), int64(5), object(7)
memory usage: 16.3+ MB



In [ ]:



In [17]:

    
Parking = namedtuple('Parking', 'x, y, objectid, rowid, holiday, violation_code, \
                    violation_description, location, rp_plate_state, body_style, \
                    address_id, streetsegid, xcoord, ycoord, filename, \
                    ticket_issue_datetime')



In [22]:



In [18]:

    
rows = []
for index, row in df.iterrows():
    row = Parking._make(row)
    rows.append(row)



In [19]:

    
len(rows)









    Out[19]:





132850



In [ ]:



In [ ]:



In [ ]:



In [20]:

    
df.to_csv('sample.csv')



In [ ]: