In [180]:
%matplotlib inline
import logging
import itertools
import json
import os
import re
import pickle
import folium
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from IPython.display import Image
from datetime import date
from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from src.data.string_format import format_name, to_short_name
from src.data.visualization import (lon_min_longitude, lon_min_latitude,
                                    lon_max_longitude, lon_max_latitude,
                                    lon_center_latitude, lon_center_longitude,
                                    create_london_map)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
In [181]:
collected = pd.read_csv('data/raw/redistribution/collected.csv', encoding='latin-1')
distributed = pd.read_csv('data/raw/redistribution/distributed.csv', encoding='latin-1')
with open('data/parsed/stations_dataset_final.p', 'rb') as f:
    stations = pickle.load(f)
In [182]:
# drop rows with nulls; with so few columns, partial records carry too little information to keep
collected.dropna(inplace=True)
distributed.dropna(inplace=True)
# convert columns to their appropriate datatypes
collected['NbBikes'] = collected['NbBikes'].astype('uint16')
distributed['NbBikes'] = distributed['NbBikes'].astype('uint16')
# format station name
distributed['Name'] = distributed['Name'].apply(format_name)
collected['Name'] = collected['Name'].apply(format_name)
distributed['Timestamp'] = pd.to_datetime(distributed['Timestamp'], format='%d/%m/%Y %H:%M', errors='raise').dt.tz_localize('UTC')
collected['Timestamp'] = pd.to_datetime(collected['Timestamp'], format='%d/%m/%Y %H:%M', errors='raise').dt.tz_localize('UTC')
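As a quick sanity check (a minimal sketch, assuming the conversions above ran cleanly), the dtypes and timezone can be asserted directly:
assert collected['NbBikes'].dtype == np.uint16
assert distributed['NbBikes'].dtype == np.uint16
assert str(collected['Timestamp'].dt.tz) == 'UTC'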
In [183]:
distributed.info()
In [184]:
collected.info()
In [185]:
distributed['ShortName'] = distributed['Name'].map(to_short_name)
collected['ShortName'] = collected['Name'].map(to_short_name)
In [186]:
distributed.describe()
Out[186]:
In [187]:
def drop_multinomial(idxs, merged_incorrectly, merged_correctly):
    # get the rows of the given duplicate indexes
    df = merged_incorrectly.loc[list(idxs)]
    # count how often each candidate station id appears among the correct merges
    selector = merged_correctly['Id'].isin(df['Id'])
    counts = merged_correctly[selector]['Id'].value_counts()
    # draw one station from the multinomial distribution given by those frequencies
    probs = counts / counts.sum()
    multinomial_dist = np.random.multinomial(1, probs)
    station_id = counts.index[np.argmax(multinomial_dist)]
    # drop the other candidates
    to_drop_selector = df['Id'] != station_id
    to_drop = df[to_drop_selector].index.values
    return to_drop
def drop_randomly(idxs, merged_incorrectly=None, merged_correctly=None):
    # keep one candidate uniformly at random and drop the rest
    idxs = set(idxs)
    idxs.remove(random.choice(list(idxs)))
    return idxs
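To see the weighting in isolation, here is a toy run (hypothetical ids and frames, not the real data): the candidate id that occurs most often among the correct full-name merges is the most likely to be kept.
# two candidate rows (indexes 10 and 11) compete for one reading;
# id 1 appears three times in the correct merges, id 2 once
toy_correct = pd.DataFrame({'Id': [1, 1, 1, 2]})
toy_incorrect = pd.DataFrame({'Id': [1, 2]}, index=[10, 11])
drop_multinomial({10, 11}, toy_incorrect, toy_correct)
# -> array([11]) with probability 3/4: id 1 is kept, index 11 is dropped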
In [188]:
def split_null(df):
    # split into rows that matched a station id and rows that did not
    matched = df[~df['Id'].isnull()].copy()
    unmatched = df[df['Id'].isnull()][['Timestamp', 'Name', 'NbBikes', 'ShortName']].copy()
    return matched, unmatched
def assign_station_id(df, drop_using):
    # merge using the full station name
    merged = pd.merge(df, stations[['Id', 'Name']], how='left', on='Name')
    merged_on_name, remaining_null = split_null(merged)
    print('%d readings could not be merged with the full station name' % len(remaining_null))
    # without a tie-breaking strategy, keep only the unambiguous full-name matches
    if drop_using is None:
        return merged_on_name
    # merge the remaining readings using the short name
    merged = pd.merge(remaining_null, stations[['Id', 'ShortName']], how='left', on='ShortName')
    merged_on_shortname, remaining_null = split_null(merged)
    print('%d readings could not be merged with the short station name' % len(remaining_null))
    # merging by short name can match several stations; select the duplicated readings only
    selector = merged_on_shortname.duplicated(subset=['Name', 'Timestamp', 'NbBikes'], keep=False)
    duplicates = pd.DataFrame(merged_on_shortname[selector])
    # add the index as a column for selection
    duplicates['Idx'] = duplicates.index
    # group the duplicates that belong to the same reading
    groups = duplicates.groupby(['Timestamp', 'ShortName'])['Idx'].aggregate(lambda x: set(x))
    # pick the indexes to drop from each group
    to_drop = []
    for idxs in groups:
        to_drop.extend(drop_using(idxs, merged_on_shortname, merged_on_name))
    # drop the selected indexes from the dataframe
    merged_on_shortname.drop(to_drop, inplace=True)
    return pd.concat([merged_on_name, merged_on_shortname, remaining_null]).sort_values(by=['Timestamp']).reset_index(drop=True)
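The dedup step hinges on duplicated(..., keep=False), which flags every copy of a repeated reading rather than all-but-one, so each group of ambiguous matches can be resolved as a whole; a toy illustration with hypothetical values:
toy = pd.DataFrame({'Timestamp': ['t1', 't1', 't2'],
                    'ShortName': ['a', 'a', 'b'],
                    'Id': [1, 2, 3]})
toy.duplicated(subset=['Timestamp', 'ShortName'], keep=False)
# -> [True, True, False]: both candidate matches for ('t1', 'a') are flagged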
In [189]:
distributed = assign_station_id(distributed, drop_multinomial)
In [190]:
distributed[distributed['Id'].isnull()]['Name'].unique()
Out[190]:
These stations do not exist in our stations dataset, so they will be removed.
In [191]:
distributed.dropna(inplace=True)
In [192]:
distributed = distributed.set_index(['Id', 'Timestamp']).sort_index()[['NbBikes', 'Name']]
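With the (Id, Timestamp) MultiIndex in place, per-station lookups become plain index slices; a minimal sketch of the pattern on a hypothetical frame:
toy = pd.DataFrame({'Id': [1, 1, 2],
                    'Timestamp': pd.to_datetime(['2016-01-01', '2016-01-02', '2016-01-01']),
                    'NbBikes': [5, 3, 7]})
toy = toy.set_index(['Id', 'Timestamp']).sort_index()
toy.loc[1]  # all readings for station 1, indexed by Timestamp only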
In [193]:
distributed.apply(lambda x: x.nunique())
Out[193]:
In [194]:
collected.describe()
Out[194]:
In [195]:
collected = assign_station_id(collected, drop_multinomial)
In [196]:
collected[collected['Id'].isnull()]['Name'].unique()
Out[196]:
These stations do not exist in our stations dataset, so they will be removed.
In [197]:
collected.dropna(inplace=True)
In [198]:
collected = collected.set_index(['Id', 'Timestamp']).sort_index()[['NbBikes', 'Name']]
In [199]:
collected.apply(lambda x: x.nunique())
Out[199]:
In [200]:
distributed.head()
Out[200]:
In [201]:
distributed.describe()
Out[201]:
In [202]:
distributed.info(memory_usage='deep')
In [203]:
with open('data/parsed/distributed_dataset_final.p', 'wb') as f:
    pickle.dump(distributed, f)
In [204]:
collected.head()
Out[204]:
In [205]:
collected.describe()
Out[205]:
In [206]:
collected.info(memory_usage='deep')
In [207]:
with open('data/parsed/collected_dataset_final.p', 'wb') as f:
    pickle.dump(collected, f)
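To reload the persisted datasets in a later notebook (a minimal sketch assuming the same paths):
with open('data/parsed/distributed_dataset_final.p', 'rb') as f:
    distributed = pickle.load(f)
with open('data/parsed/collected_dataset_final.p', 'rb') as f:
    collected = pickle.load(f)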