In [1]:
import pandas as pd
from collections import namedtuple
In [52]:
# Let pandas render up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500
In [8]:
# URL of the CSV export (hosted on opendata.dc.gov) that is passed to pd.read_csv.
may_2016 = "http://opendata.dc.gov/datasets/2e967e9053144a309680fccea0f7b4e1_11.csv"
# Label stored in the `filename` column of every row; not used as an output path here.
file_name = 'Parking_violations_may_2016.csv'
In [ ]:
In [9]:
# Load the CSV (first line is the header, no index column), lower-case every
# column label and make sure the frame carries a fresh 0..n-1 index.
df = (
    pd.read_csv(may_2016, header=0, index_col=None)
    .rename(columns=lambda label: label.lower())
    .reset_index(drop=True)
)
In [10]:
def mil_to_time(x):
    """Convert a messy military-time string into an 'HH:MM:00.000Z' time string.

    Parameters
    ----------
    x : str
        Stringified issue time such as '1430', '930.0', '45' or 'nan'.
        Anything after the first '.' is discarded, and the remaining 1-4
        digits are read as a right-aligned HHMM value.

    Returns
    -------
    str
        A time string of the form 'HH:MM:00.000Z'. The midnight default
        '00:00:00.000Z' is returned for 'nan', for empty input, for values
        with more than four digits and for values containing non-digit
        characters, so every result has the same, well-formed layout.
    """
    midnight = '00:00:00.000Z'
    if x == 'nan':
        return midnight

    digits = x.split('.')[0]
    # Unparseable values fall back to midnight. The original fallback was
    # '00:00.000Z' (seconds field missing), which is not a valid time.
    if not 1 <= len(digits) <= 4 or not digits.isdigit():
        return midnight

    # Left-pad to HHMM: '930' -> '0930', '45' -> '0045', '7' -> '0007'.
    digits = digits.zfill(4)
    hours, minutes = digits[:2], digits[2:]

    # Correction for a tens-of-minutes digit greater than 5 (e.g. '75'):
    # clamp that digit to 5 so the minutes stay within 00-59.
    if int(minutes[0]) > 5:
        minutes = '5' + minutes[1]

    # NOTE(review): hours are not range-checked, so e.g. '2500' still yields '25:00:00.000Z'.
    return hours + ':' + minutes + ':00.000Z'
In [11]:
# Normalised time string per ticket: stringify the raw issue_time, then
# run it through mil_to_time.
df['issue_time_military'] = df['issue_time'].apply(lambda raw: mil_to_time(str(raw)))
# Tag every row with the source label. Column insertion order is kept as
# issue_time_military -> filename -> ticket_issue_datetime.
df['filename'] = file_name
# First 10 characters of ticket_issue_date followed by the 'T' separator.
dates = df['ticket_issue_date'].str.slice(0, 10) + 'T'
df['ticket_issue_datetime'] = dates + df['issue_time_military']
In [57]:
## Sanity check: confirm that every ticket_issue_datetime value can be parsed as a datetime
# for i, t in enumerate(df.ticket_issue_datetime):
# try:
# pd.to_datetime(t)
# except:
# print i, t
In [12]:
# Turn the holiday column into a boolean flag: True wherever the value is not 0.
df['holiday'] = df['holiday'].ne(0)
In [41]:
# del df['day_of_week']
# del df['month_of_year']
# del df['week_of_year']
# del df['issue_time']
# del df['issue_time_military']
# del df['ticket_issue_date']
In [13]:
# Remove the listed columns in place; errors='ignore' skips any label that
# is not present in the frame.
df.drop(
    columns=[
        'day_of_week',
        'month_of_year',
        'week_of_year',
        'issue_time',
        'issue_time_military',
        'ticket_issue_date',
    ],
    inplace=True,
    errors='ignore',
)
In [14]:
# Keep a single row per rowid_ value (the first occurrence is retained).
df.drop_duplicates(subset='rowid_', inplace=True)
# Replace missing street-segment ids with 0. The result is assigned back to
# the frame: calling fillna(inplace=True) on an attribute-accessed column is
# chained assignment, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
df['streetsegid'] = df['streetsegid'].fillna(0)
In [16]:
# Print a summary of the frame to inspect its columns after the cleaning steps.
df.info()
In [ ]:
In [17]:
# Record type for one parking-violation row. Instances are built
# positionally with Parking._make, so the field order here is significant.
Parking = namedtuple(
    'Parking',
    [
        'x', 'y', 'objectid', 'rowid', 'holiday', 'violation_code',
        'violation_description', 'location', 'rp_plate_state', 'body_style',
        'address_id', 'streetsegid', 'xcoord', 'ycoord', 'filename',
        'ticket_issue_datetime',
    ],
)
In [22]:
In [18]:
# Convert every DataFrame row into a Parking record; values are matched to
# fields by position, in column order.
rows = [Parking._make(record) for _, record in df.iterrows()]
In [19]:
# Number of Parking records built from the frame.
len(rows)
Out[19]:
In [ ]:
In [ ]:
In [ ]:
In [20]:
# Write the processed frame to 'sample.csv' (a relative path, resolved against the working directory).
df.to_csv('sample.csv')
In [ ]: