The raw data contains the following fields per station per reading:
The following variables will be derived from the raw data.
In [1]:
%matplotlib inline
import logging
import itertools
import json
import os
import pickle
import folium
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from IPython.display import Image
from datetime import date
from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from src.data.string_format import format_name, to_short_name
from src.data.visualization import lon_min_longitude, lon_min_latitude, lon_max_longitude, lon_max_latitude, lon_center_latitude, lon_center_longitude, create_london_map
logger = logging.getLogger()
logger.setLevel(logging.INFO)
In [2]:
def parse_cycles(json_obj):
    """Parses TfL's BikePoint JSON response"""
    return [parse_station(element) for element in json_obj]

def parse_station(element):
    """Parses a JSON bicycle station object into a dictionary"""
    obj = {
        'Id': element['id'],
        'Name': element['commonName'],
        'Latitude': element['lat'],
        'Longitude': element['lon'],
        'PlaceType': element['placeType'],
    }
    for p in element['additionalProperties']:
        obj[p['key']] = p['value']
        if 'Timestamp' not in obj:
            obj['Timestamp'] = p['modified']
        elif obj['Timestamp'] != p['modified']:
            raise ValueError('The properties\' timestamps for station %s do not match: %s != %s' % (
                obj['Id'], obj['Timestamp'], p['modified']))
    return obj
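For reference, this is roughly the shape of the input and output of parse_station. The station values below are made up for illustration; only the keys are taken from the parsing code above.
# hypothetical BikePoint element, with only the keys used by parse_station
element = {
    'id': 'BikePoints_1',
    'commonName': 'Example Street, Clerkenwell',
    'lat': 51.5291,
    'lon': -0.1099,
    'placeType': 'BikePoint',
    'additionalProperties': [
        {'key': 'NbBikes', 'value': '12', 'modified': '2016-05-16T07:00:00.000'},
        {'key': 'NbEmptyDocks', 'value': '6', 'modified': '2016-05-16T07:00:00.000'},
    ],
}
parse_station(element)
# {'Id': 'BikePoints_1', 'Name': 'Example Street, Clerkenwell', 'Latitude': 51.5291,
#  'Longitude': -0.1099, 'PlaceType': 'BikePoint', 'NbBikes': '12', 'NbEmptyDocks': '6',
#  'Timestamp': '2016-05-16T07:00:00.000'}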
In [3]:
def bike_file_date_fn(file_name):
    """Gets the file's date from its name"""
    return datetime.strptime(os.path.basename(file_name), 'BIKE-%Y-%m-%d:%H:%M:%S.json')

def create_between_dates_filter(file_date_fn, date_start, date_end):
    def filter_fn(file_name):
        file_date = file_date_fn(file_name)
        return date_start <= file_date <= date_end
    return filter_fn
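As a quick sanity check, here is a sketch of how these helpers behave; the file name below is made up but follows the BIKE-%Y-%m-%d:%H:%M:%S.json pattern expected above.
# hypothetical file name following the expected naming convention
example_file = 'BIKE-2016-05-16:07:00:00.json'
bike_file_date_fn(example_file)
# datetime.datetime(2016, 5, 16, 7, 0)

morning_filter = create_between_dates_filter(bike_file_date_fn,
                                             datetime(2016, 5, 16, 0, 0, 0),
                                             datetime(2016, 5, 16, 12, 0, 0))
morning_filter(example_file)
# True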
In [4]:
filter_fn = create_between_dates_filter(bike_file_date_fn,
datetime(2016, 5, 16, 7, 0, 0),
datetime(2016, 5, 16, 23, 59, 59))
records = parse_dir('/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/raw/cycles',
parse_cycles, sort_fn=bike_file_date_fn, filter_fn=filter_fn)
# records is a list of lists of dicts
df = pd.DataFrame(list(itertools.chain.from_iterable(records)))
In [5]:
df.head()
Out[5]:
In [6]:
df[df['Id'] == 'BikePoints_1'].head()
Out[6]:
Due to memory constraints, we'll parse the data in chunks. In each chunk we'll remove the redundant candidate keys and drop any duplicate rows (see the small illustration after the chunker helper below).
In [7]:
def chunker(seq, size):
    """Yields successive chunks of `seq` with at most `size` elements each"""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
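A small illustration of how chunker batches a list (the values are arbitrary):
list(chunker(['a', 'b', 'c', 'd', 'e'], 2))
# [['a', 'b'], ['c', 'd'], ['e']]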
In [8]:
def split_data(parsed_data):
    """Splits the parsed records into a readings dataframe and a stations dataframe"""
    master_df = pd.DataFrame(list(itertools.chain.from_iterable(parsed_data)))
    readings_df = pd.DataFrame(master_df, columns=['Id', 'Timestamp', 'NbBikes', 'NbDocks', 'NbEmptyDocks'])
    stations_df = pd.DataFrame(master_df, columns=['Id', 'Name', 'TerminalName', 'PlaceType', 'Latitude',
                                                   'Longitude', 'Installed', 'Temporary', 'Locked',
                                                   'RemovalDate', 'InstallDate'])
    return (readings_df, stations_df)
In [ ]:
# get the files to parse
five_weekdays_filter = create_between_dates_filter(bike_file_date_fn,
datetime(2016, 6, 19, 0, 0, 0),
datetime(2016, 6, 27, 23, 59, 59))
files = get_file_list('data/raw/cycles', filter_fn=None, sort_fn=bike_file_date_fn)
# process the files in chunks
files_batches = chunker(files, 500)
In [ ]:
# start with an empty dataset
readings_dataset = pd.DataFrame()
stations_dataset = pd.DataFrame()
# append each chunk to the datasets while removing duplicates
for batch in files_batches:
    parsed_data = parse_json_files(batch, parse_cycles)
    # split the data into stations data and readings data
    readings_df, stations_df = split_data(parsed_data)
    # append the chunk to the datasets
    readings_dataset = pd.concat([readings_dataset, readings_df])
    stations_dataset = pd.concat([stations_dataset, stations_df])
    # remove duplicated rows
    readings_dataset.drop_duplicates(inplace=True)
    stations_dataset.drop_duplicates(inplace=True)
In [ ]:
# put the parsed data in pickle files
pickle.dump(readings_dataset, open("data/parsed/readings_dataset_raw.p", "wb"))
pickle.dump(stations_dataset, open("data/parsed/stations_dataset_raw.p", "wb"))
In [9]:
stations_dataset = pickle.load(open('data/parsed/stations_dataset_raw.p', 'rb'))
readings_dataset = pickle.load(open('data/parsed/readings_dataset_raw.p', 'rb'))
In [10]:
# convert columns to their appropriate datatypes
stations_dataset['InstallDate'] = pd.to_numeric(stations_dataset['InstallDate'], errors='raise')
stations_dataset['RemovalDate'] = pd.to_numeric(stations_dataset['RemovalDate'], errors='raise')
stations_dataset['Installed'].replace({'true': True, 'false': False}, inplace=True)
stations_dataset['Temporary'].replace({'true': True, 'false': False}, inplace=True)
stations_dataset['Locked'].replace({'true': True, 'false': False}, inplace=True)
readings_dataset['NbBikes'] = readings_dataset['NbBikes'].astype('uint16')
readings_dataset['NbDocks'] = readings_dataset['NbDocks'].astype('uint16')
readings_dataset['NbEmptyDocks'] = readings_dataset['NbEmptyDocks'].astype('uint16')
In [11]:
# format station name
stations_dataset['Name'] = stations_dataset['Name'].apply(format_name)
In [12]:
# convert string timestamp to datetime
stations_dataset['InstallDate'] = pd.to_datetime(stations_dataset['InstallDate'], unit='ms', errors='raise')
stations_dataset['RemovalDate'] = pd.to_datetime(stations_dataset['RemovalDate'], unit='ms', errors='raise')
readings_dataset['Timestamp'] = pd.to_datetime(readings_dataset['Timestamp'], format='%Y-%m-%dT%H:%M:%S.%f', errors='raise').dt.tz_localize('UTC')
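As a quick illustration of the readings conversion above (the timestamp value is made up but follows the raw format):
pd.to_datetime('2016-05-16T07:03:25.000', format='%Y-%m-%dT%H:%M:%S.%f').tz_localize('UTC')
# Timestamp('2016-05-16 07:03:25+0000', tz='UTC')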
In [13]:
# sort the datasets
stations_dataset.sort_values(by=['Id'], ascending=True, inplace=True)
readings_dataset.sort_values(by=['Timestamp'], ascending=True, inplace=True)
In [14]:
# derive a short station name and the number of unusable (out of service) docks
stations_dataset['ShortName'] = stations_dataset['Name'].apply(to_short_name)
# e.g. a station with 20 docks, 12 bikes and 6 empty docks has 2 unusable docks
readings_dataset['NbUnusableDocks'] = readings_dataset['NbDocks'] - (readings_dataset['NbBikes'] + readings_dataset['NbEmptyDocks'])
Station priorities were downloaded from https://www.whatdotheyknow.com/request/tfl_boris_bike_statistics?unfold=1
In [15]:
stations_priorities = pd.read_csv('data/raw/priorities/station_priorities.csv', encoding='latin-1')
stations_priorities['Site'] = stations_priorities['Site'].apply(format_name)
In [16]:
stations_dataset = pd.merge(stations_dataset, stations_priorities, how='left', left_on='ShortName', right_on='Site')
stations_dataset['Priority'].replace({'One': '1', 'Two': '2', 'Long Term Suspended': np.NaN, 'Long term suspension': np.NaN}, inplace=True)
stations_dataset.drop(['Site'], axis=1, inplace=True)
stations_dataset.drop(['Borough'], axis=1, inplace=True)
In [17]:
stations_dataset
Out[17]:
In [18]:
stations_dataset.shape
Out[18]:
In [19]:
stations_dataset.info(memory_usage='deep')
In [20]:
stations_dataset.head()
Out[20]:
In [21]:
stations_dataset.describe()
Out[21]:
In [22]:
stations_dataset.apply(lambda x:x.nunique())
Out[22]:
In [23]:
stations_dataset.isnull().sum()
Out[23]:
In [24]:
def find_duplicate_ids(df):
    """Finds Ids that have more than one value in any of the given columns"""
    df = df.drop_duplicates()
    value_counts_grouped_by_id = df.groupby('Id').count()
    is_duplicate_id = value_counts_grouped_by_id.applymap(lambda x: x > 1).any(axis=1)
    duplicate_ids = value_counts_grouped_by_id[is_duplicate_id].index.values
    return df[df['Id'].isin(duplicate_ids)]

duplicate_ids = find_duplicate_ids(stations_dataset)
duplicate_ids
Out[24]:
Given that these records share the same location and Id but differ in Name or TerminalName, we'll assume the station was renamed and remove the earlier entries.
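A quick way to confirm which columns actually disagree (a sketch, reusing the find_duplicate_ids helper defined above):
# count the distinct values per column for each duplicated Id;
# columns with a count greater than 1 are the ones whose values differ across the duplicates
find_duplicate_ids(stations_dataset).groupby('Id').nunique()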
In [25]:
# remove the one not in Merchant Street
stations_dataset.drop(417, inplace=True)
# remove the one with the shortest name
stations_dataset.drop(726, inplace=True)
# remove the one that is not in King's Cross (as the station name implies it should be)
stations_dataset.drop(745, inplace=True)
# remove the remaining duplicated entries
stations_dataset.drop([747, 743, 151, 754, 765, 768], inplace=True)
In [26]:
# make sure there are no repeated ids
assert len(find_duplicate_ids(stations_dataset)) == 0
Let's have a closer look at the station locations. All of them should be in Greater London.
In [27]:
def find_locations_outside_box(locations, min_longitude, min_latitude, max_longitude, max_latitude):
    """Returns the locations whose coordinates fall outside the given bounding box"""
    latitude_check = ~((locations['Latitude'] >= min_latitude) & (locations['Latitude'] <= max_latitude))
    longitude_check = ~((locations['Longitude'] >= min_longitude) & (locations['Longitude'] <= max_longitude))
    return locations[latitude_check | longitude_check]

outlier_locations_df = find_locations_outside_box(stations_dataset, lon_min_longitude, lon_min_latitude,
                                                  lon_max_longitude, lon_max_latitude)
outlier_locations_df
Out[27]:
This station looks like a test station, so we'll remove it.
In [28]:
outlier_locations_idx = outlier_locations_df.index.values
stations_dataset.drop(outlier_locations_idx, inplace=True)
In [29]:
# make sure there are no stations outside London
assert len(find_locations_outside_box(stations_dataset, lon_min_longitude, lon_min_latitude,
lon_max_longitude, lon_max_latitude)) == 0
Next, we'll investigate the stations that share a latitude or longitude value with another station.
In [30]:
# find stations with duplicate longitude
id_counts_groupedby_longitude = stations_dataset.groupby('Longitude')['Id'].count()
nonunique_longitudes = id_counts_groupedby_longitude[id_counts_groupedby_longitude != 1].index.values
nonunique_longitude_stations = stations_dataset[stations_dataset['Longitude'].isin(nonunique_longitudes)].sort_values(by=['Longitude'])
id_counts_groupedby_latitude = stations_dataset.groupby('Latitude')['Id'].count()
nonunique_latitudes = id_counts_groupedby_latitude[id_counts_groupedby_latitude != 1].index.values
nonunique_latitudes_stations = stations_dataset[stations_dataset['Latitude'].isin(nonunique_latitudes)].sort_values(by=['Latitude'])
nonunique_coordinates_stations = pd.concat([nonunique_longitude_stations, nonunique_latitudes_stations])
nonunique_coordinates_stations
Out[30]:
In [31]:
def draw_stations_map(stations_df):
    """Draws the given stations as markers on a map of London"""
    stations_map = create_london_map()
    for index, station in stations_df.iterrows():
        folium.Marker([station['Latitude'], station['Longitude']], popup=station['Name']).add_to(stations_map)
    return stations_map
In [32]:
draw_stations_map(nonunique_coordinates_stations)
Out[32]:
We can observe that the stations are indeed different and that sharing the same longitude is just a coincidence.
Let's plot all the stations on a map to see how they are distributed.
In [33]:
london_longitude = -0.127722
london_latitude = 51.507981
# plot only the first MAX_RECORDS stations
MAX_RECORDS = 100
stations_map = create_london_map()
for index, station in stations_dataset[0:MAX_RECORDS].iterrows():
    folium.Marker([station['Latitude'], station['Longitude']], popup=station['Name']).add_to(stations_map)
stations_map
#folium.Map.save(stations_map, 'reports/maps/stations_map.html')
Out[33]:
In [34]:
readings_dataset.shape
Out[34]:
In [35]:
readings_dataset.info(memory_usage='deep')
In [36]:
readings_dataset.head()
Out[36]:
In [37]:
readings_dataset.describe()
Out[37]:
In [38]:
readings_dataset.apply(lambda x:x.nunique())
Out[38]:
In [39]:
readings_dataset.isnull().sum()
Out[39]:
In [40]:
timestamps = readings_dataset['Timestamp']
ax = timestamps.groupby([timestamps.dt.year, timestamps.dt.month, timestamps.dt.day]).count().plot(kind="bar")
ax.set_xlabel('Date')
ax.set_title('Readings per Day')
Out[40]:
In [41]:
# restrict the readings to the 15 May - 27 June 2016 window
start_date = date(2016, 5, 15)
end_date = date(2016, 6, 27)
days = set(pd.date_range(start=start_date, end=end_date, closed='left'))
readings_dataset = readings_dataset[(timestamps > start_date) & (timestamps < end_date)]
In [42]:
# get a subview of the readings dataset
id_timestamp_view = readings_dataset.loc[:,['Id','Timestamp']]
# remove the time component of the timestamp
id_timestamp_view['Timestamp'] = id_timestamp_view['Timestamp'].apply(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0))
# compute the days of readings per stations
days_readings = id_timestamp_view.groupby('Id').aggregate(lambda x: set(x))
days_readings['MissingDays'] = days_readings['Timestamp'].apply(lambda x: list(days - x))
days_readings['MissingDaysCount'] = days_readings['MissingDays'].apply(lambda x: len(x))
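The missing-days computation above boils down to a set difference per station; a tiny illustration with made-up dates:
all_days = {pd.Timestamp('2016-06-19'), pd.Timestamp('2016-06-20'), pd.Timestamp('2016-06-21')}
observed_days = {pd.Timestamp('2016-06-19'), pd.Timestamp('2016-06-21')}
sorted(all_days - observed_days)
# [Timestamp('2016-06-20 00:00:00')]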
In [43]:
pickle.dump(days_readings.query('MissingDaysCount > 0'), open("data/parsed/missing_days.p", "wb"))
In [44]:
def expand_datetime(df, datetime_col):
    """Adds a Weekday column derived from the given datetime column"""
    df['Weekday'] = df[datetime_col].apply(lambda x: x.weekday())
    return df
In [45]:
# get the stations with missing readings only
missing_days_readings = days_readings[days_readings['MissingDaysCount'] != 0]
missing_days_readings = missing_days_readings['MissingDays'].apply(lambda x: pd.Series(x)).unstack().dropna()
missing_days_readings.index = missing_days_readings.index.droplevel()
# sort and format in their own DF
missing_days_readings = pd.DataFrame(missing_days_readings, columns=['MissingDay'], index=None).reset_index().sort_values(by=['Id', 'MissingDay'])
# expand the missing day date
expand_datetime(missing_days_readings, 'MissingDay')
Out[45]:
In [46]:
missing_days_readings
Out[46]:
In [47]:
missing_days_readings['Id'].nunique()
Out[47]:
In [48]:
# plot the missing readings days
days = missing_days_readings['MissingDay']
missing_days_counts = days.groupby([days.dt.year, days.dt.month, days.dt.day]).count()
ax = missing_days_counts.plot(kind="bar")
ax.set_xlabel('Date')
ax.set_ylabel('Number of Stations')
Out[48]:
Stations with no readings on at least one day
In [49]:
missing_days_readings_stations = stations_dataset[stations_dataset['Id'].isin(missing_days_readings['Id'].unique())]
draw_stations_map(missing_days_readings_stations)
Out[49]:
Stations with no readings on at least one weekend day
In [50]:
weekend_readings = missing_days_readings[missing_days_readings['Weekday'] > 4]
missing_dayreadings_stn = stations_dataset[stations_dataset['Id'].isin(weekend_readings['Id'].unique())]
draw_stations_map(missing_dayreadings_stn)
Out[50]:
Stations with no readings on at least one weekday
In [51]:
weekday_readings = missing_days_readings[missing_days_readings['Weekday'] < 5]
missing_dayreadings_stn = stations_dataset[stations_dataset['Id'].isin(weekday_readings['Id'].unique())]
draw_stations_map(missing_dayreadings_stn)
Out[51]:
Observations:
In [59]:
# station Ids that appear in the readings but were removed from the stations dataset
stations_to_remove = set(readings_dataset.Id) - set(stations_dataset.Id)
In [60]:
# drop the readings that belong to the removed stations
readings_dataset = readings_dataset[~readings_dataset.Id.isin(stations_to_remove)]
In [62]:
readings_dataset.reset_index(inplace=True, drop=True)
In [63]:
readings_dataset.head()
Out[63]:
In [65]:
readings_dataset.describe()
Out[65]:
In [66]:
readings_dataset.info(memory_usage='deep')
In [67]:
pickle.dump(readings_dataset, open("data/parsed/readings_dataset_utc.p", "wb"))
In [68]:
stations_dataset.reset_index(inplace=True, drop=True)
In [69]:
stations_dataset.head()
Out[69]:
In [70]:
stations_dataset.describe()
Out[70]:
In [71]:
stations_dataset.info(memory_usage='deep')
In [72]:
pickle.dump(stations_dataset, open("data/parsed/stations_dataset_final.p", "wb"))