In [4]:
import itertools
import pandas as pd
from scripts.preprocess.parse_json import parse_dir
In [5]:
def parse_station(element):
    """Parses a JSON bicycle station object to a dictionary.

    Args:
        element: dict describing one station. It must provide the keys
            'id', 'commonName', 'lat', 'lon', 'placeType' and
            'additionalProperties'; the latter is an iterable of dicts,
            each with 'key', 'value' and 'modified' entries.

    Returns:
        A flat dict with 'Id', 'Name', 'Latitude', 'Longitude' and
        'PlaceType', one entry per additional property (stored under the
        property's own 'key'), and 'Timestamp' holding the properties'
        shared 'modified' value (absent when there are no properties).

    Raises:
        ValueError: if two additional properties of the same station carry
            different 'modified' values.
    """
    obj = {
        'Id': element['id'],
        'Name': element['commonName'],
        'Latitude': element['lat'],
        'Longitude': element['lon'],
        'PlaceType': element['placeType']
    }

    for p in element['additionalProperties']:
        obj[p['key']] = p['value']

        # The first property seeds the station timestamp; every later one
        # must agree with it. The lookup key has to match the 'Timestamp'
        # key written below, otherwise the consistency check never runs.
        if 'Timestamp' not in obj:
            obj['Timestamp'] = p['modified']
        elif obj['Timestamp'] != p['modified']:
            raise ValueError('The properties\' timestamps for station %s do not match: %s != %s' % (
                obj['Id'], obj['Timestamp'], p['modified']))

    return obj
def parse_cycles(json_obj):
    """Turns TfL's BikePoint JSON response into a list of station dictionaries.

    Each element of ``json_obj`` is handed to ``parse_station`` and the
    results are collected in the same order.
    """
    stations = []
    for station_json in json_obj:
        stations.append(parse_station(station_json))
    return stations
In [6]:
# Directory handed to parse_dir together with the parse_cycles callback.
# NOTE(review): parse_dir is project-local; presumably it applies the callback
# to every JSON file found in this directory -- confirm against its source.
data_dir = '/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/dev'
records = parse_dir(data_dir, parse_cycles)
In [7]:
# Flatten the nested iterable in `records` into a single list of station
# dictionaries, then build one DataFrame row per dictionary.
station_rows = [row for batch in records for row in batch]
dataset = pd.DataFrame(station_rows)
dataset.shape
Out[7]:
In [1]:
# convert columns to their appropriate datatypes
dataset['InstallDate'] = pd.to_numeric(dataset['InstallDate'], errors='raise')


def _parse_bool(value):
    """Converts a raw flag value to a real boolean.

    Strings are matched case-insensitively against 'true' / 'false'; any other
    string raises ValueError. Non-string values fall back to ``bool(value)``.
    """
    if isinstance(value, str):
        literal = value.strip().lower()
        if literal == 'true':
            return True
        if literal == 'false':
            return False
        raise ValueError('Cannot interpret %r as a boolean' % (value,))
    return bool(value)


# astype('bool_') on a column of strings uses Python truthiness, so the
# non-empty string 'false' would become True. Parse the literals explicitly.
# NOTE(review): assumes the flags arrive as 'true'/'false' strings -- confirm
# against the raw JSON.
for flag_column in ('Installed', 'Temporary', 'Locked'):
    dataset[flag_column] = dataset[flag_column].map(_parse_bool).astype(bool)

for count_column in ('NbBikes', 'NbDocks', 'NbEmptyDocks'):
    dataset[count_column] = dataset[count_column].astype('uint16')

# convert string timestamp to datetime
dataset['Timestamp'] = pd.to_datetime(dataset['Timestamp'], format='%Y-%m-%dT%H:%M:%S.%f')
# InstallDate is numeric at this point and is interpreted as epoch milliseconds
dataset['InstallDate'] = pd.to_datetime(dataset['InstallDate'], unit='ms')
In [16]:
# Print the frame summary; memory_usage='deep' requests the deep memory report.
dataset.info(memory_usage='deep')
In [14]:
# Show the first rows of the frame as the cell output.
dataset.head()
Out[14]:
In [ ]: