In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
In [4]:
weight_gurus = pd.read_csv('data/raw/weight-gurus-history.csv')
weight_gurus.head()
Out[4]:
In [5]:
weight_gurus['Weight (lb)'].plot()
Out[5]:
In [6]:
mfp = pd.read_csv('data/raw/myfitnesspal-export.csv')
mfp = mfp.dropna(subset=['Weight'])
mfp.head()
Out[6]:
In [7]:
mfp.Weight.plot()
Out[7]:
In [8]:
strava = pd.read_csv('data/raw/strava-activities.csv')
# Keep only runs. The explicit .copy() makes the filtered rows an independent
# frame, so later cells can add or overwrite columns on it without pandas
# raising SettingWithCopyWarning.
strava = strava[strava['Activity Type'] == "Run"].copy()
strava.head()
Out[8]:
In [9]:
runkeeper = pd.read_csv('data/raw/runkeeper-activities.csv')
# Keep only running activities. The explicit .copy() makes the filtered rows
# an independent frame, so later cells can add or overwrite columns on it
# without pandas raising SettingWithCopyWarning.
runkeeper = runkeeper[runkeeper['Type'] == "Running"].copy()
runkeeper.head()
Out[9]:
In [10]:
from datetime import datetime
from datetime import time
In [11]:
weight_gurus_dt_format = "%b %d %Y %I:%M:%S %p"
mfp_dt_format = "%Y-%m-%d"
strava_dt_format = "%b %d, %Y, %I:%M:%S %p"
runkeeper_dt_format = "%Y-%m-%d %H:%M:%S"
In [12]:
weight_gurus = weight_gurus.rename(columns={'Date/Time': 'Date'})
weight_gurus['Date'] = weight_gurus['Date'].apply(lambda x: datetime.strptime(x, weight_gurus_dt_format))
In [13]:
mfp['Date'] = mfp['Date'].apply(lambda x: datetime.strptime(x, mfp_dt_format))
In [14]:
strava = strava.rename(columns={'Activity Date': 'Date'})
strava['Date'] = strava['Date'].apply(lambda x: datetime.strptime(x, strava_dt_format))
In [15]:
runkeeper['Date'] = runkeeper['Date'].apply(lambda x: datetime.strptime(x, runkeeper_dt_format))
In [16]:
weight_gurus = weight_gurus.rename(columns={'Weight (lb)': 'Weight'})
In [17]:
weight_cols = ['Date', 'Weight']
weight_df = pd.concat([
mfp[weight_cols],
weight_gurus[weight_cols]
])
weight_df = weight_df.sort_values('Date')
weight_df.head()
Out[17]:
In [18]:
# Convert km -> mi
strava['Distance'] = strava['Distance'] * 0.621371
In [19]:
# Calculate pace (in decimal minutes)
strava['Pace_min'] = strava['Elapsed Time'] / (60*strava['Distance'])
In [20]:
# Calculate duration (in decimal minutes)
strava['Duration_min'] = strava['Elapsed Time']/60.0
In [21]:
from math import floor
def decimal_minute_to_time(dec_minutes):
    """Format a duration given in decimal minutes as "M:SS" or "H:MM:SS".

    The value is rounded to the nearest whole second before it is split into
    fields, so floating-point noise cannot drop a second (8.2 -> "8:12") and
    a value that rounds up to a full minute or hour carries into the next
    field (59.999 -> "1:00:00").

    Parameters
    ----------
    dec_minutes : float
        Non-negative time in minutes

    Returns
    -------
    str
        "H:MM:SS" when the duration is at least one hour, "M:SS" otherwise.

    Raises
    ------
    ValueError
        If ``dec_minutes`` is negative or NaN.
    OverflowError
        If ``dec_minutes`` is infinite.
    """
    if dec_minutes < 0:
        raise ValueError(
            "Time must be non-negative, got {}".format(dec_minutes)
        )
    # Derive every field from one rounded second count so they stay consistent.
    total_seconds = int(round(dec_minutes * 60))
    minute, sec = divmod(total_seconds, 60)
    hour, minute = divmod(minute, 60)
    if hour > 0:
        return "{}:{:02}:{:02}".format(hour, minute, sec)
    return "{}:{:02}".format(minute, sec)
def time_to_decimal_minute(time_str):
    """Convert a "MM:SS" or "HH:MM:SS" string to decimal minutes.

    Parameters
    ----------
    time_str : str
        Time in "MM:SS" or "HH:MM:SS" format

    Returns
    -------
    float
        Total time in minutes, e.g. "2:05:30" -> 125.5.

    Raises
    ------
    ValueError
        If the string does not have exactly two or three colon-separated
        integer fields, or if the seconds field is outside 0-59.
    """
    time_list = time_str.split(":")
    if len(time_list) not in (2, 3):
        raise ValueError(
            "Bad time string format. Expected MM:SS or HH:MM:SS: {}".format(
                time_str
            )
        )
    # int() raises ValueError on its own for non-numeric fields.
    minute, second = int(time_list[-2]), int(time_list[-1])
    if not 0 <= second < 60:
        raise ValueError(
            "Bad time string format. Seconds must be 0-59: {}".format(second)
        )
    if len(time_list) == 3:
        minute = minute + 60.0 * int(time_list[0])
    dec_minute = minute + second / 60.0
    return dec_minute
In [22]:
decimal_minute_to_time(125.5)
Out[22]:
In [23]:
time_to_decimal_minute("2:05:30")
Out[23]:
In [24]:
# Convert decimal minute to MM:SS
strava['Pace'] = strava['Pace_min'].apply(decimal_minute_to_time)
strava['Duration'] = strava['Duration_min'].apply(decimal_minute_to_time)
In [25]:
strava = strava.rename(columns={'Activity Name': 'Name',
'Activity Description': 'Description'})
In [26]:
strava['Tracker'] = 'Strava'
In [27]:
strava.head()
Out[27]:
In [28]:
runkeeper = runkeeper.rename(columns={'Distance (mi)': 'Distance',
'Notes': 'Name',
'Average Pace': 'Pace'})
In [29]:
runkeeper['Pace_min'] = runkeeper['Pace'].apply(time_to_decimal_minute)
runkeeper['Duration_min'] = runkeeper['Duration'].apply(time_to_decimal_minute)
In [30]:
runkeeper['Description'] = None
runkeeper['Tracker'] = "Runkeeper"
In [31]:
runkeeper.head()
Out[31]:
In [32]:
run_cols = ['Date', 'Name', 'Description', 'Distance', 'Pace',
'Duration', 'Pace_min', 'Duration_min', 'Tracker']
In [33]:
run_df = pd.concat([strava[run_cols], runkeeper[run_cols]])
In [34]:
run_df.head()
Out[34]:
In [44]:
# strptime-style formats for the date column of each export; each one is
# consumed by the matching process_* loader defined in the cells below.
# Weight Gurus 'Date/Time' column (12-hour clock with AM/PM).
WG_DT_FORMAT = "%b %d %Y %I:%M:%S %p"
# MyFitnessPal 'Date' column (date only).
MFP_DT_FORMAT = "%Y-%m-%d"
# Runkeeper 'Date' column (24-hour clock).
RUNKEEPER_DT_FORMAT = "%Y-%m-%d %H:%M:%S"
# Strava 'Activity Date' column (12-hour clock with AM/PM, comma-separated).
STRAVA_DT_FORMAT = "%b %d, %Y, %I:%M:%S %p"
# Columns kept by combine_weights when stacking the weight sources.
WEIGHT_COLS = ["Date", "Weight"]
# Columns kept by combine_runs; every run loader must produce all of them.
RUN_COLS = ['Date', 'Name', 'Description', 'Distance', 'Pace',
'Duration', 'Pace_min', 'Duration_min', 'Tracker']
In [45]:
def process_weight_gurus(wg_filename):
    """Load a Weight Gurus CSV export with normalized column names.

    Renames 'Date/Time' to 'Date' and 'Weight (lb)' to 'Weight', then parses
    'Date' into datetimes using WG_DT_FORMAT.

    Parameters
    ----------
    wg_filename : str
        Location of the export; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All columns of the export, with 'Date' and 'Weight' renamed as above.
    """
    weight_gurus = pd.read_csv(wg_filename).rename(
        columns={'Date/Time': 'Date', 'Weight (lb)': 'Weight'}
    )
    # Vectorized parse: one call instead of a Python-level strptime per row;
    # missing values become NaT rather than raising.
    weight_gurus['Date'] = pd.to_datetime(
        weight_gurus['Date'], format=WG_DT_FORMAT
    )
    return weight_gurus
In [46]:
def process_mfp_weight(mfp_filename):
    """Load the weight entries from a MyFitnessPal CSV export.

    Rows without a 'Weight' value are dropped and 'Date' is parsed into
    datetimes using MFP_DT_FORMAT.

    Parameters
    ----------
    mfp_filename : str
        Location of the export; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of the export that have a 'Weight', with 'Date' parsed.
    """
    mfp = pd.read_csv(mfp_filename).dropna(subset=['Weight'])
    # assign() returns a new frame, so the parsed column is never written
    # into the result of the dropna() filter itself.
    return mfp.assign(
        Date=pd.to_datetime(mfp['Date'], format=MFP_DT_FORMAT)
    )
In [47]:
def process_runkeeper(runkeeper_filename):
    """Load the running activities from a Runkeeper CSV export.

    Keeps rows whose 'Type' is "Running", renames 'Distance (mi)', 'Notes'
    and 'Average Pace' to 'Distance', 'Name' and 'Pace', parses 'Date' with
    RUNKEEPER_DT_FORMAT, and adds the columns 'Pace_min' and 'Duration_min'
    (decimal minutes derived from the 'Pace' and 'Duration' strings),
    'Description' (always None) and 'Tracker' ("Runkeeper").

    Parameters
    ----------
    runkeeper_filename : str
        Location of the export; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    runkeeper = pd.read_csv(runkeeper_filename)
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not raise SettingWithCopyWarning.
    runkeeper = runkeeper[runkeeper['Type'] == "Running"].copy()
    runkeeper['Date'] = pd.to_datetime(
        runkeeper['Date'], format=RUNKEEPER_DT_FORMAT
    )
    runkeeper = runkeeper.rename(columns={'Distance (mi)': 'Distance',
                                          'Notes': 'Name',
                                          'Average Pace': 'Pace'})
    # Numeric versions of the "MM:SS" / "HH:MM:SS" strings
    runkeeper['Pace_min'] = runkeeper['Pace'].apply(time_to_decimal_minute)
    runkeeper['Duration_min'] = runkeeper['Duration'].apply(time_to_decimal_minute)
    # No description column is taken from this export; fill it with None so
    # the frame has every column listed in RUN_COLS.
    runkeeper['Description'] = None
    runkeeper['Tracker'] = "Runkeeper"
    return runkeeper
In [48]:
def process_strava(strava_filename, dt_format=STRAVA_DT_FORMAT):
    """Load the running activities from a Strava CSV export.

    Keeps rows whose 'Activity Type' is "Run", renames 'Activity Date',
    'Activity Name' and 'Activity Description' to 'Date', 'Name' and
    'Description', parses 'Date', converts 'Distance' from km to miles, and
    adds 'Pace_min' / 'Duration_min' (decimal minutes), their formatted
    string versions 'Pace' / 'Duration', and 'Tracker' ("Strava").

    Parameters
    ----------
    strava_filename : str
        Location of the export; passed straight to ``pd.read_csv``.
    dt_format : str
        strptime-style format of the 'Activity Date' column.

    Returns
    -------
    pandas.DataFrame
    """
    # Load and filter to only running activities. .copy() makes the filtered
    # rows an independent frame, so the column assignments below do not raise
    # SettingWithCopyWarning.
    strava = pd.read_csv(strava_filename)
    strava = strava[strava['Activity Type'] == "Run"].copy()
    # Rename the features for consistency
    strava = strava.rename(columns={'Activity Date': 'Date',
                                    'Activity Name': 'Name',
                                    'Activity Description': 'Description'})
    # Turn Date into datetime type (vectorized; missing values become NaT)
    strava['Date'] = pd.to_datetime(strava['Date'], format=dt_format)
    # Convert km -> mi
    strava['Distance'] = strava['Distance'] * 0.621371
    # Calculate pace (in decimal minutes per mile)
    # assumes 'Elapsed Time' is in seconds, as the /60 conversions imply
    # NOTE(review): a row with Distance == 0 gives a non-finite pace, which
    # decimal_minute_to_time cannot format - confirm exports never contain one.
    strava['Pace_min'] = strava['Elapsed Time'] / (60 * strava['Distance'])
    # Calculate duration (in decimal minutes)
    strava['Duration_min'] = strava['Elapsed Time'] / 60.0
    # Convert decimal minute to MM:SS
    strava['Pace'] = strava['Pace_min'].apply(decimal_minute_to_time)
    strava['Duration'] = strava['Duration_min'].apply(decimal_minute_to_time)
    # Tag each row with the tracker it came from
    strava['Tracker'] = 'Strava'
    return strava
In [49]:
def combine_weights(df_list, weight_cols=WEIGHT_COLS):
    """Stack the weight columns of several frames, ordered by 'Date'.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames that each contain every column in ``weight_cols``.
    weight_cols : list of str
        Columns to keep from each frame.

    Returns
    -------
    pandas.DataFrame
        The selected columns of all frames concatenated and sorted by 'Date'.
    """
    trimmed = [frame[weight_cols] for frame in df_list]
    return pd.concat(trimmed).sort_values('Date')
In [50]:
def combine_runs(df_list, run_cols=RUN_COLS):
    """Stack the run columns of several frames, ordered by 'Date'.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames that each contain every column in ``run_cols``.
    run_cols : list of str
        Columns to keep from each frame.

    Returns
    -------
    pandas.DataFrame
        The selected columns of all frames concatenated and sorted by 'Date'.
    """
    selected = []
    for frame in df_list:
        selected.append(frame[run_cols])
    stacked = pd.concat(selected)
    return stacked.sort_values('Date')
In [51]:
def main(raw_dir='data/raw', processed_dir='data/processed'):
    """Build the combined run and weight CSVs from the raw tracker exports.

    Reads 'strava-activities.csv', 'runkeeper-activities.csv',
    'myfitnesspal-export.csv' and 'weight-gurus-history.csv' from
    ``raw_dir`` and writes 'run.csv' and 'weight.csv' to ``processed_dir``.

    Parameters
    ----------
    raw_dir : str
        Directory holding the four raw export files.
    processed_dir : str
        Directory the combined CSVs are written to; created if missing.
    """
    import os  # local import so this cell does not depend on the import cell

    strava = process_strava(os.path.join(raw_dir, 'strava-activities.csv'))
    runkeeper = process_runkeeper(
        os.path.join(raw_dir, 'runkeeper-activities.csv')
    )
    mfp = process_mfp_weight(os.path.join(raw_dir, 'myfitnesspal-export.csv'))
    weight_gurus = process_weight_gurus(
        os.path.join(raw_dir, 'weight-gurus-history.csv')
    )
    run_df = combine_runs([strava, runkeeper])
    weight_df = combine_weights([mfp, weight_gurus])
    # to_csv does not create missing parent directories, so make sure the
    # output directory exists before writing.
    os.makedirs(processed_dir, exist_ok=True)
    run_df.to_csv(os.path.join(processed_dir, 'run.csv'))
    weight_df.to_csv(os.path.join(processed_dir, 'weight.csv'))
In [52]:
# Entry point: run the whole pipeline when this module is the main program.
if __name__ == "__main__":
    print("processing data beep boop bonk")
    main()