In [17]:
import pandas as pd
import numpy as np
import re
from dateutil.parser import parse
In [18]:
data = pd.read_csv('Signs.csv')
data.head()
Out[18]:
In [19]:
data['SIGNDESC1'].head()
Out[19]:
In [45]:
# https://regex101.com/r/fC0lI5/10
p = re.compile(r'(NOON|MIDNIGHT.*|[01]?[0-9]+:?[0-9]*(?:[AP]MM?)?)\s*(?:-|TO|\s)\s*(NOON|MIDNIGHT|[01]?[0-9]+:?[0-9]*(?:[AP]MM?)?)')
def extract_time(desc, group=1):
m = p.search(desc)
if m:
time = m.group(group)
if time == 'MIDNIGHT':
return '12AM'
elif time == 'MIDNIGHT TO':
return '12AM'
elif time == 'NOON':
return '12PM'
elif 'MM' in time:
match_amm_or_pmm = re.compile(r'([0-9])*?([AP]MM?)')
matched_time = match_amm_or_pmm.search(time)
if matched_time:
meridiem = 'AM' if matched_time.group(2) == 'AMM' else 'PM'
oclock = matched_time.group(1)
return '{0}{1}'.format(oclock, meridiem)
return m.group(group)
return np.nan
def from_time(signdesc):
f_time = extract_time(signdesc, group=1)
if f_time or not np.nan:
return parse(f_time).strftime('%I:%M%p')
return None
def to_time(signdesc):
t_time = extract_time(signdesc, group=2)
if t_time or not np.nan:
return parse(t_time).strftime('%I:%M%p')
return None
In [46]:
row = 'NO PARKING (SANITATION BROOM SYMBOL) 7AM-7:30AM EXCEPT SUNDAY'
assert from_time(row) == '07:00AM'
assert to_time(row) == '07:30AM'
special_case1 = 'NO PARKING (SANITATION BROOM SYMBOL) 11:30AM TO 1PM THURS'
assert from_time(special_case1) == '11:30AM'
assert to_time(special_case1) == '01:00PM'
special_case2 = 'NO PARKING (SANITATION BROOM SYMBOL) MOON & STARS (SYMBOLS) TUESDAY FRIDAY MIDNIGHT-3AM'
assert from_time(special_case2) == '12:00AM'
assert to_time(special_case2) == '03:00AM'
special_case3 = 'TRUCK (SYMBOL) TRUCK LOADING ONLY MONDAY-FRIDAY NOON-2PM'
assert from_time(special_case3) == '12:00PM'
assert to_time(special_case3) == '02:00PM'
special_case4 = 'NIGHT REGULATION (MOON & STARS SYMBOLS) NO PARKING (SANITATION BROOM SYMBOL) MIDNIGHT TO-3AM WED & SAT'
assert from_time(special_case4) == '12:00AM'
assert to_time(special_case4) == '03:00AM'
special_case5 = 'NO PARKING (SANITATION BROOM SYMBOL)8AM 11AM TUES & THURS'
assert from_time(special_case5) == '08:00AM'
assert to_time(special_case5) == '11:00AM'
special_case6 = 'NO PARKING (SANITATION BROOM SYMBOL) MONDAY THURSDAY 7AMM-7:30AM'
assert from_time(special_case6) == '07:00AM'
assert to_time(special_case6) == '07:30AM'
In [50]:
def filter_from_time(row):
if not pd.isnull(row['SIGNDESC1']):
print(from_time(row['SIGNDESC1']))
return from_time(row['SIGNDESC1'])
return np.nan
In [51]:
def filter_to_time(row):
if not pd.isnull(row['SIGNDESC1']):
return to_time(row['SIGNDESC1'])
return np.nan
In [52]:
data['FROM_TIME'] = data.apply(filter_from_time, axis=1)
In [ ]:
data['TO_TIME'] = data.apply(filter_to_time, axis=1)
In [ ]:
data[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']].head(10)
In [ ]:
rows_with_AM_PM_but_time_NaN = data[(data['FROM_TIME'].isnull() | data['FROM_TIME'].isnull()) & (data['SIGNDESC1'].str.contains('[0-9]+(?:[AP]M)'))]
In [ ]:
len(rows_with_AM_PM_but_time_NaN)
In [ ]:
rows_with_AM_PM_but_time_NaN[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']]
In [ ]:
data.iloc[180670, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[180670, data.columns.get_loc('FROM_TIME')] = '9AM'
data.iloc[180670, data.columns.get_loc('TO_TIME')] = '4AM'
In [ ]:
data.iloc[212089, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[212089, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[212089, data.columns.get_loc('TO_TIME')] = '11:30AM'
In [ ]:
data.iloc[258938, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[258938, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258938, data.columns.get_loc('TO_TIME')] = '11:30AM'
In [ ]:
data.iloc[258942, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[258942, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258942, data.columns.get_loc('TO_TIME')] = '11:30AM'
In [ ]:
data.iloc[258944, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[258944, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258944, data.columns.get_loc('TO_TIME')] = '11:30AM'
In [ ]:
data.iloc[283262, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.iloc[283262, data.columns.get_loc('FROM_TIME')] = '6AM'
data.iloc[283262, data.columns.get_loc('TO_TIME')] = '7:30AM'
Confirm that every row has from_time and to_time
In [ ]:
rows_with_AM_PM_but_time_NaN = data[(data['FROM_TIME'].isnull() | data['FROM_TIME'].isnull()) & (data['SIGNDESC1'].str.contains('[0-9]+(?:[AP]M)'))]
len(rows_with_AM_PM_but_time_NaN)
In [ ]:
data[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']]
In [ ]:
data['SIGNDESC1'].head(20)
In [ ]:
#https://regex101.com/r/fO4zL8/3
regex_to_extract_days_idv_days = r'\b((?:(?:MON|MONDAY|TUES|TUESDAY|WED|WEDNESDAY|THURS|THURSDAY|FRI|FRIDAY|SAT|SATURDAY|SUN|SUNDAY)\s*)+)(?=\s|$)'
regex_to_extract_days_with_range = r'(MON|TUES|WED|THURS|FRI|SAT|SUN)\s(THRU|\&)\s(MON|TUES|WED|THURS|FRI|SAT|SUN)'
def extract_day(signdesc):
days = ['MON', 'TUES', 'WED', 'THURS', 'FRI', 'SAT', 'SUN']
p_idv_days = re.compile(regex_to_extract_days_idv_days)
m_idv_days = p_idv_days.search(signdesc)
p_range_days = re.compile(regex_to_extract_days_with_range)
m_range_days = p_range_days.search(signdesc)
if 'EXCEPT SUN' in signdesc:
return ', '.join(days[:6])
if 'INCLUDING SUNDAY' in signdesc:
return ', '.join(days)
if 'FRIW/' in signdesc:
return ', '.join(['FRI'])
if ('THRU' in signdesc) and m_range_days:
from_day = m_range_days.group(1)
to_day = m_range_days.group(3)
idx_frm_d = days.index(from_day)
idx_to_d = days.index(to_day)
return ', '.join([days[n] for n in range(idx_frm_d, idx_to_d + 1)])
if ('&' in signdesc) and m_range_days:
from_day = m_range_days.group(1)
to_day = m_range_days.group(3)
return ', '.join([from_day, to_day])
if m_idv_days:
days = m_idv_days.group(1)
d = []
for day in days.split(' '):
if len(day) > 3:
if day in ['MONDAY', 'WEDNESDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']:
d.append(day[:3])
if day in ['TUESDAY']:
d.append(day[:4])
if day in ['THURSDAY']:
d.append(day[:5])
else:
d.append(day)
return ', '.join(d)
return np.nan
In [ ]:
def filter_days(row):
if not pd.isnull(row['SIGNDESC1']):
return extract_day(row['SIGNDESC1'])
return np.nan
In [ ]:
assert extract_day('NO STANDING 11AM-7AM MON SAT') == "MON, SAT"
assert extract_day('NO STANDING MON FRI 7AM-9AM') == "MON, FRI"
assert extract_day('2 HOUR PARKING 9AM-5PM MON THRU SAT') == "MON, TUES, WED, THURS, FRI, SAT"
assert extract_day('1 HOUR PARKING 8AM-7PM EXCEPT SUNDAY') == "MON, TUES, WED, THURS, FRI, SAT"
assert extract_day('NO PARKING 10PM-8AM INCLUDING SUNDAY') == "MON, TUES, WED, THURS, FRI, SAT, SUN"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) MONDAY THURSDAY 9:30AM-11AM') == "MON, THURS"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) 11:30AM TO 1 PM FRIW/ SINGLE ARROW') == "FRI"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) 8-9:30AM TUES & FRI') == "TUES, FRI"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) TUESDAY FRIDAY 11AM-12:30PM') == "TUES, FRI"
In [ ]:
data['DAYS'] = data.apply(filter_days, axis=1)
In [ ]:
rows_with_days_but_DAYS_NAN = data[data['DAYS'].isnull() & data['SIGNDESC1'].str.contains('\sMON|\sTUES|\sWED|\sTHURS|\sFRI|\sSAT|\sSUN')]
In [ ]:
rows_with_days_but_DAYS_NAN[['SIGNDESC1', 'DAYS']]
In [ ]:
data.iloc[308838, data.columns.get_loc('SIGNDESC1')]
In [ ]:
data.head()
In [ ]:
data.to_csv('Processed_Signs.csv', index=False)
In [ ]: