Loading Parking Violation Data and cleaning operations


In [7]:
import glob
import json
import pandas as pd
import requests

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
with open('dc_parking_violations.json', 'r') as f:
    parking_violations = json.load(f)

In [ ]:
for fullname, csv in parking_violations.items():
    download_file = csv + '.csv'
    local_filename = '_'.join(name.lower() for name in fullname.split() ) + '.csv'
    r = requests.get(download_file)
    with open(local_filename, 'wb') as f:
            f.write(r.content)

In [ ]:

Combine all csvs into one data frame


In [15]:
list_dfs = []
for file_name, url in list(parking_violations.items()):
    csv_file = url + '.csv'
    df = pd.read_csv(csv_file, index_col=None, header=0)    
    df['filename'] = file_name
       
    list_dfs.append(df)
    
frame = pd.concat(list_dfs)

# TESTs
assert frame.filename.nunique() == len(parking_violations)
frame.columns = [col.lower() for col in frame.columns]
frame = frame.reset_index(drop=True)
df = frame.copy()


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-15-e41bb7cde928> in <module>()
     10 
     11 # TESTs
---> 12 assert frame.filename.nunique() == len(parking_violations)
     13 frame.columns = [col.lower() for col in frame.columns]
     14 frame = frame.reset_index(drop=True)

AssertionError: 

In [ ]:


In [ ]:

Construct datetime object from issue_time raw string


In [ ]:
def mil_to_time(x):
    "Convert messy issue_time to datetime object based upon length of issue_time string"
    if x == 'nan':
        return '00:00:00.000Z'
        
    x = x.split('.')[0]
    lg = len(x)
    
    if lg == 4:
        t = x[:2] + ':' + x[2:] + ':00.000Z'
      
    elif lg == 3:
        t = '0' + x[0] + ':' + x[1:] + ':00.000Z'
    
    elif lg == 2:
        t = '0' + '0' + ':' + x + ':00.000Z'
    
    elif lg == 1:
        t = '0' + '0' + ':' + '0' + x + ':00.000Z'
    
    else:
        t = '00:00.000Z'
    
    # correction for timedate if one element is greater than 5.
    # double check this
    if int(t[3]) > 5:
        t = t[:2]+ ':' + '5' + t[4:]
        
    return t

In [ ]:
df['issue_time_military'] = df.issue_time.apply(str).apply(mil_to_time)
dates = df.ticket_issue_date.str[:10] + 'T' #+ 
df['ticket_issue_datetime'] = dates + df.issue_time_military

Optional Test that datetime contruction is correct


In [12]:
## Testing datetime format to ensure that everything is actually datetime
# for i, t in enumerate(df.ticket_issue_datetime):
#     try:
#         pd.to_datetime(t)
#     except:
#         print i, t

Holiday value to Boolean


In [ ]:
df['holiday'] = df.holiday != 0

Delete redundant columns


In [ ]:
del df['day_of_week']
del df['month_of_year']
del df['week_of_year']
del df['issue_time']
del df['issue_time_military']
del df['ticket_issue_date']

Drop Duplicates and Fill in Empy Rows


In [ ]:
df.drop_duplicates(subset='rowid_', inplace=True)
df.streetsegid.fillna(0, inplace=True)

In [ ]:
df.to_pickle('parking_eda.pkle')

Export to CSV


In [16]:
df.to_csv('../cleaned_data/clean_parking_violations.tsv', sep='\t', index=False)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-16-77f172269af4> in <module>()
----> 1 df.to_csv('../cleaned_data/clean_parking_violations.tsv', sep='\t', index=False)

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/core/frame.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal, **kwds)
   1342                                      doublequote=doublequote,
   1343                                      escapechar=escapechar, decimal=decimal)
-> 1344         formatter.save()
   1345 
   1346         if path_or_buf is None:

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in save(self)
   1549 
   1550             else:
-> 1551                 self._save()
   1552 
   1553         finally:

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in _save(self)
   1650                 break
   1651 
-> 1652             self._save_chunk(start_i, end_i)
   1653 
   1654     def _save_chunk(self, start_i, end_i):

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in _save_chunk(self, start_i, end_i)
   1664                                   decimal=self.decimal,
   1665                                   date_format=self.date_format,
-> 1666                                   quoting=self.quoting)
   1667 
   1668             for col_loc, col in zip(b.mgr_locs, d):

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/core/internals.py in to_native_types(self, slicer, na_rep, float_format, decimal, quoting, **kwargs)
   1441                                         decimal=decimal, quoting=quoting,
   1442                                         fixed_width=False)
-> 1443         return formatter.get_result_as_array()
   1444 
   1445     def should_store(self, value):

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in get_result_as_array(self)
   2169             float_format = self.float_format
   2170 
-> 2171         formatted_values = format_values_with(float_format)
   2172 
   2173         if not self.fixed_width:

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in format_values_with(float_format)
   2155             imask = (~mask).ravel()
   2156             values.flat[imask] = np.array([formatter(val)
-> 2157                                            for val in values.ravel()[imask]])
   2158 
   2159             if self.fixed_width:

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in <listcomp>(.0)
   2155             imask = (~mask).ravel()
   2156             values.flat[imask] = np.array([formatter(val)
-> 2157                                            for val in values.ravel()[imask]])
   2158 
   2159             if self.fixed_width:

/usr/local/var/pyenv/versions/3.5.2/envs/trafficdata/lib/python3.5/site-packages/pandas/formats/format.py in base_formatter(v)
   2106         else:
   2107             def base_formatter(v):
-> 2108                 return str(v) if notnull(v) else self.na_rep
   2109 
   2110         if self.decimal != '.':

KeyboardInterrupt: 

In [17]:
df.sample(10000).to_csv('../sampled_data/clean_parking_violations_10k_sample.tsv', sep='\t', index=False)

In [18]:
df.sample(100000).to_csv('../sampled_data/clean_parking_violations_100k_sample.tsv', sep='\t', index=False)

In [19]:
df.sample(1000000).to_csv('../sampled_data/clean_parking_violations_1million_sample.tsv', sep='\t', index=False)