Bike Redistribution Preprocessing

Setup


In [180]:
%matplotlib inline

import logging
import itertools
import json
import os
import re
import pickle
import folium
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from IPython.display import Image
from datetime import date

from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from src.data.string_format import format_name, to_short_name
from src.data.visualization import lon_min_longitude, lon_min_latitude, lon_max_longitude, lon_max_latitude, lon_center_latitude, lon_center_longitude, create_london_map

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [181]:
collected = pd.read_csv('data/raw/redistribution/collected.csv', encoding='latin-1')
distributed = pd.read_csv('data/raw/redistribution/distributed.csv', encoding='latin-1')
stations = pickle.load(open('data/parsed/stations_dataset_final.p', 'rb'))

Technically Correct Data


In [182]:
# remove null entries; with only three columns, rows with nulls carry too
# little information to keep
collected.dropna(inplace=True)
distributed.dropna(inplace=True)

# convert the bike counts to a compact unsigned integer type
collected['NbBikes'] = collected['NbBikes'].astype('uint16')
distributed['NbBikes'] = distributed['NbBikes'].astype('uint16')

# format station names consistently
distributed['Name'] = distributed['Name'].apply(format_name)
collected['Name'] = collected['Name'].apply(format_name)

# parse the timestamps and localize them to UTC
distributed['Timestamp'] = pd.to_datetime(distributed['Timestamp'], format='%d/%m/%Y %H:%M', errors='raise').dt.tz_localize('UTC')
collected['Timestamp'] = pd.to_datetime(collected['Timestamp'], format='%d/%m/%Y %H:%M', errors='raise').dt.tz_localize('UTC')
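
A quick sanity check (a sketch added here for illustration, not an original cell) confirms that no nulls remain and that the conversions took effect:

for df in (collected, distributed):
    assert not df.isnull().values.any()          # no nulls survived dropna
    assert df['NbBikes'].dtype == np.uint16      # counts stored compactly
    assert str(df['Timestamp'].dt.tz) == 'UTC'   # timestamps are tz-aware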

Distributed Cycles


In [183]:
distributed.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 39692 entries, 0 to 39691
Data columns (total 3 columns):
Timestamp    39692 non-null datetime64[ns, UTC]
Name         39692 non-null object
NbBikes      39692 non-null uint16
dtypes: datetime64[ns, UTC](1), object(1), uint16(1)
memory usage: 1007.8+ KB

Collected Cycles


In [184]:
collected.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 43488 entries, 0 to 43487
Data columns (total 3 columns):
Timestamp    43488 non-null datetime64[ns, UTC]
Name         43488 non-null object
NbBikes      43488 non-null uint16
dtypes: datetime64[ns, UTC](1), object(1), uint16(1)
memory usage: 1.1+ MB

Derive Data


In [185]:
distributed['ShortName'] = distributed['Name'].map(to_short_name)
collected['ShortName'] = collected['Name'].map(to_short_name)
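
to_short_name comes from src.data.string_format; its implementation is not shown here, but judging by the station names in this dataset it presumably strips the area suffix from the full name. A hypothetical stand-in, for illustration only:

# hypothetical stand-in for to_short_name; the real implementation lives in
# src.data.string_format
def to_short_name_sketch(name):
    return name.split(',')[0].strip()

to_short_name_sketch('River Street, Clerkenwell')  # -> 'River Street'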

Consistent Data

Distributed Cycles


In [186]:
distributed.describe()


Out[186]:
            NbBikes
count  39692.000000
mean       9.956717
std        5.458785
min        0.000000
25%        6.000000
50%        9.000000
75%       15.000000
max       36.000000

Assign Station Id


In [187]:
def drop_multinomial(idxs, merged_incorrectly, merged_correctly):
    # get the duplicate rows at the given indexes
    df = merged_incorrectly.loc[idxs]

    # count how often each candidate station id appears among the
    # unambiguous merges
    selector = merged_correctly['Id'].isin(df['Id'])
    counts = merged_correctly[selector]['Id'].value_counts()

    # keep one station id, sampled with probability proportional to its
    # frequency (a single multinomial draw)
    probs = counts / counts.sum()
    multinomial_dist = np.random.multinomial(1, probs)
    station_id = counts.index[np.argmax(multinomial_dist)]

    # return the indexes of the other rows so they can be dropped
    to_drop_selector = df['Id'] != station_id
    to_drop = df[to_drop_selector].index.values
    return to_drop

def drop_randomly(idxs, merged_incorrectly=None, merged_correctly=None):
    # keep one duplicate uniformly at random; mutates the given set and
    # returns the remaining indexes to drop
    idxs.remove(random.sample(idxs, 1)[0])
    return idxs
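
A toy example (hypothetical data, a sketch) of how drop_multinomial resolves an ambiguous merge: the duplicate rows at indexes 0 and 1 map one reading to two candidate stations, and the candidate seen more often among the unambiguous merges is proportionally more likely to be kept:

# one ambiguous reading matched two candidate stations
ambiguous = pd.DataFrame({'Id': ['BikePoints_1', 'BikePoints_2']}, index=[0, 1])
# among the unambiguous merges, BikePoints_1 appears nine times out of ten
unambiguous = pd.DataFrame({'Id': ['BikePoints_1'] * 9 + ['BikePoints_2']})

# BikePoints_1 is kept with probability 0.9, so this usually returns [1]
drop_multinomial([0, 1], ambiguous, unambiguous)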

In [188]:
def split_null(df):
    matched = df[~df['Id'].isnull()].copy()
    unmatched = df[df['Id'].isnull()][['Timestamp', 'Name', 'NbBikes', 'ShortName']].copy()
    return matched, unmatched

def assign_station_id(df, drop_using):
    # merge using the full station name
    merged = pd.merge(df, stations[['Id', 'Name']], how='left', left_on='Name', right_on='Name')
    merged_on_name, remaining_null = split_null(merged)
    print '%d readings could not be merged with the full station name' % len(remaining_null) 
    
    if drop_using is None:
        return merged_on_name
    
    # merge using the short name
    merged = pd.merge(remaining_null, stations[['Id', 'ShortName']], how='left', left_on='ShortName', right_on='ShortName')
    merged_on_shortname, remaining_null = split_null(merged)
    print '%d readings could not be merged with the short station name' % len(remaining_null) 

    # drop duplicate entries from merging by short name
    # select the duplicate entries only
    selector = merged_on_shortname.duplicated(subset=['Name', 'Timestamp', 'NbBikes'], keep=False)
    duplicates = pd.DataFrame(merged_on_shortname[selector])
    # add the index as a column for selection
    duplicates['Idx'] = duplicates.index
    # group the duplicates
    groups = duplicates.groupby(['Timestamp', 'ShortName'])['Idx'].aggregate(lambda x: set(x))
    # select indexes to drop from each group
    to_drop = []
    for idxs in groups:
        to_drop.extend(drop_using(idxs, merged_on_shortname, merged_on_name))
    # drop selected indexes from dataframe
    merged_on_shortname.drop(to_drop, inplace=True)
        
    return pd.concat([merged_on_name, merged_on_shortname, remaining_null]).sort_values(by=['Timestamp']).reset_index(drop=True)
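
The duplicate-resolution strategy is pluggable: drop_randomly is a drop-in alternative that keeps one candidate uniformly at random instead of weighting by observed frequency (note that it mutates the set it is given). For instance:

# with three duplicate rows, two indexes come back to be dropped,
# e.g. {10, 12} if row 11 happens to be the one kept
drop_randomly({10, 11, 12})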

In [189]:
distributed = assign_station_id(distributed, drop_multinomial)


1738 readings could not be merged with the full station name
15 readings could not be merged with the short station name

In [190]:
distributed[distributed['Id'].isnull()]['Name'].unique()


Out[190]:
array([u'Penton Street Depot'], dtype=object)

These readings all come from the Penton Street Depot, which does not exist in our stations dataset, so they will be removed.


In [191]:
distributed.dropna(inplace=True)


In [192]:
distributed = distributed.set_index(['Id', 'Timestamp']).sort_index()[['NbBikes', 'Name']]
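
With the sorted (Id, Timestamp) MultiIndex in place, per-station and per-period lookups become straightforward; an illustrative query (BikePoints_1 is one of the station ids in this dataset):

# all recorded drop-offs at one station
distributed.loc['BikePoints_1']

# the same station restricted to January 2016, via partial string indexing
distributed.loc['BikePoints_1'].loc['2016-01']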

In [193]:
distributed.apply(lambda x: x.nunique())


Out[193]:
NbBikes     23
Name       771
dtype: int64

Collected Cycles


In [194]:
collected.describe()


Out[194]:
            NbBikes
count  43488.000000
mean       8.946215
std        5.180654
min        0.000000
25%        6.000000
50%        8.000000
75%       12.000000
max       36.000000

Assign Station Id


In [195]:
collected = assign_station_id(collected, drop_multinomial)


1455 readings could not be merged with the full station name
92 readings could not be merged with the short station name

In [196]:
collected[collected['Id'].isnull()]['Name'].unique()


Out[196]:
array([u'Penton Street Depot'], dtype=object)

Again, these readings all come from the Penton Street Depot, which does not exist in our stations dataset, so they will be removed.


In [197]:
collected.dropna(inplace=True)

In [198]:
collected = collected.set_index(['Id', 'Timestamp']).sort_index()[['NbBikes', 'Name']]

In [199]:
collected.apply(lambda x: x.nunique())


Out[199]:
NbBikes     26
Name       758
dtype: int64

Build Datasets

Distributed


In [200]:
distributed.head()


Out[200]:
                                       NbBikes                       Name
Id           Timestamp
BikePoints_1 2016-01-02 18:19:00+00:00       6  River Street, Clerkenwell
             2016-01-05 10:49:00+00:00      10  River Street, Clerkenwell
             2016-01-06 14:25:00+00:00       7  River Street, Clerkenwell
             2016-01-08 11:14:00+00:00      12  River Street, Clerkenwell
             2016-01-10 11:33:00+00:00       9  River Street, Clerkenwell

In [201]:
distributed.describe()


Out[201]:
            NbBikes
count  39677.000000
mean       9.957658
std        5.457354
min        0.000000
25%        6.000000
50%        9.000000
75%       15.000000
max       36.000000

In [202]:
distributed.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 39677 entries, (BikePoints_1, 2016-01-02 18:19:00+00:00) to (BikePoints_99, 2016-06-28 17:54:00+00:00)
Data columns (total 2 columns):
NbBikes    39677 non-null uint16
Name       39677 non-null object
dtypes: object(1), uint16(1)
memory usage: 8.2 MB

In [203]:
pickle.dump(distributed, open("data/parsed/distributed_dataset_final.p", "wb"))

Collected


In [204]:
collected.head()


Out[204]:
                                         NbBikes                       Name
Id            Timestamp
BikePoints_1  2016-01-18 17:12:00+00:00        9  River Street, Clerkenwell
BikePoints_10 2016-01-06 08:21:00+00:00        6      Park Street, Bankside
              2016-01-10 02:48:00+00:00        8      Park Street, Bankside
              2016-01-13 08:36:00+00:00       11      Park Street, Bankside
              2016-01-14 08:40:00+00:00        8      Park Street, Bankside

In [205]:
collected.describe()


Out[205]:
            NbBikes
count  43396.000000
mean       8.938888
std        5.170141
min        0.000000
25%        6.000000
50%        8.000000
75%       12.000000
max       36.000000

In [206]:
collected.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 43396 entries, (BikePoints_1, 2016-01-18 17:12:00+00:00) to (BikePoints_99, 2016-06-30 08:55:00+00:00)
Data columns (total 2 columns):
NbBikes    43396 non-null uint16
Name       43396 non-null object
dtypes: object(1), uint16(1)
memory usage: 8.9 MB

In [207]:
pickle.dump(collected, open("data/parsed/collected_dataset_final.p", "wb"))
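
Downstream notebooks can then load the two final datasets the same way the stations dataset was loaded above; a minimal sketch:

distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb'))
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb'))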