In [1]:
import pandas as pd
import os
import numpy as np
import datetime as dt
I've registered earlier to Leada's Data Literacy, as it looked pretty fun (and it was :-) )
Email received from Leada :
You just made your 2015 New Year's resolution to improve your data analysis skills and become data literate!
Every two weeks, we will e-mail you a dataset and some problems to work on. You can use any tool you'd like, although we suggest using R or Python. R is easy to install and if you are new to programming, the Introduction to R lesson is free in our courses!
Here's the first dataset and problem! Answers will be released at a later date. Tweet your solution @LeadaHQ!
====================
Dataset Description: A CSV of bicycle rental transactions for a rental company in California.
In [2]:
# Load & Parse dates
df = pd.read_csv('./data/bike_trip_data.csv', parse_dates=[2,5] )
In [3]:
avg_duration_mins, avg_duration_secs = divmod(df['Duration'].mean(), 60)
print 'The average total time is {:g} minutes and {:.0f} seconds'.format(avg_duration_mins, avg_duration_secs)
In [4]:
# Create a new column containing the day
df['Day'] = df['Start Date'].apply(lambda x: x.date())
In [5]:
# Grouping by Day, counting any other column
dfq2 = df.groupby('Day')[['Duration']].count()
dfq2.columns = ['Nb Trips']
dfq2 = dfq2.sort(columns='Nb Trips', ascending = False)
dfq2.head(1)
Out[5]:
In [6]:
# Curiosity check : how many unique terminals are there
all_terminals = set(df['Start Terminal'])
print 'There is {} distinct terminals'.format(len(all_terminals))
In [7]:
# With pivot tables, create two dataframes that lists the changesin inventory
# Taken are the bikes leaving terminals, back are taken back to the terminal
taken = df.pivot_table(index=df.index, columns=['Start Terminal'], values=['Duration'], aggfunc='count').fillna(0) * -1
back = df.pivot_table(index=df.index, columns=['End Terminal'], values=['Duration'], aggfunc='count').fillna(0)
In [8]:
# We combine bikes inflows & outflows, and create a cumulative sum
total = taken + back
total = total.cumsum()['Duration']
In [9]:
# 5 last lines of the dataframe
total.tail()
Out[9]:
In [10]:
# Now we create a dictionary, that will hold the final result :
# - Key : terminal number
# - Value : index of the time that requires rebalancing
#
# A terminal requires rebalacing the first time it reaches -30 bikes
index_stamps = {}
for col in total.columns:
stamp = None
stamp = total[col].loc[total[col] == -30].head(1).index.values
if stamp.size == 0:
index_stamps[col] = None
else:
index_stamps[col] = [stamp[0]]
# Now we fetch the actual datetime in the original dataframe
for terminal, rebal_list in index_stamps.items():
if rebal_list:
index = rebal_list[0]
rebal_list.append(df.loc[index]['Start Date'])
In [11]:
# Example for terminal 2
index_stamps[2]
Out[11]:
In [12]:
# Displaying the result
for terminal, rebal_list in index_stamps.items():
if rebal_list:
print 'Terminal {term} needs rebalancing on {date}'.format(term=terminal, date=rebal_list[1])