JupyterWorkflow2

From exploratory analysis to reproducible research

Mehmetcan Budak


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("seaborn")

In [2]:
import os
from urllib.request import urlretrieve

import pandas as pd

URL = "https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD"


# Only download this data when we need to (i.e. on the first run, or when forced).
def get_fremont_data(filename="Fremont.csv", url=URL, force_download=False):
    """Download and cache the Fremont data, then load it as a DataFrame.

    Parameters
    ----------
    filename : str, optional
        Location where the downloaded CSV is saved and read back from.
    url : str, optional
        Web location of the data.
    force_download : bool, optional
        If True, download the data again even if ``filename`` already exists.

    Returns
    -------
    data : pandas.DataFrame
        The data indexed by the parsed ``Date`` column, with columns
        ``West``, ``East`` and their sum ``Total``.

    Notes
    -----
    Assumes the CSV has exactly two data columns besides ``Date``; they are
    relabelled ``West`` and ``East`` in file order.
    """
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)
    # Read from `filename` rather than a hard-coded path, so that a
    # non-default cache location is actually the file that gets loaded.
    data = pd.read_csv(filename, index_col="Date", parse_dates=True)
    data.columns = ["West", "East"]
    data["Total"] = data["West"] + data["East"]
    return data

In [3]:
# Load the Fremont data (downloaded only if it is not already on disk)
# and preview the first five rows.
data = get_fremont_data()
data.head(5)


Out[3]:
West East Total
Date
2012-10-03 00:00:00 4.0 9.0 13.0
2012-10-03 01:00:00 4.0 6.0 10.0
2012-10-03 02:00:00 1.0 1.0 2.0
2012-10-03 03:00:00 2.0 3.0 5.0
2012-10-03 04:00:00 6.0 1.0 7.0

In [4]:
# Sum the counts into weekly bins and plot every column.
weekly_totals = data.resample("W").sum()
weekly_totals.plot()


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c1fd240>

In [5]:
# Average the counts for each time of day across all dates and plot them.
time_of_day_mean = data.groupby(data.index.time).mean()
time_of_day_mean.plot()


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c8862e8>

In [6]:
# Reshape "Total" so that each row is a time of day and each column a date.
pivoted = data.pivot_table(
    values="Total",
    index=data.index.time,
    columns=data.index.date,
)
# Show only the top-left 5x5 corner instead of dumping the whole frame.
pivoted.iloc[0:5, 0:5]


Out[6]:
2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07
00:00:00 13.0 18.0 11.0 15.0 11.0
01:00:00 10.0 3.0 8.0 15.0 17.0
02:00:00 2.0 9.0 7.0 9.0 3.0
03:00:00 5.0 3.0 4.0 3.0 6.0
04:00:00 7.0 8.0 9.0 5.0 3.0

In [7]:
# One nearly transparent line per date; the legend is turned off.
pivoted.plot(alpha=0.01, legend=False)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cf5fc50>

Second part: turn this into a Python package so that we and other people can use it for analysis.

Go to the directory

1. `mkdir jupyterworkflow` (create a directory).
2. `touch jupyterworkflow/__init__.py` (initialize a Python package).
3. Create a `data.py` in this directory:

import os
from urllib.request import urlretrieve

import pandas as pd

FREMONT_URL = "https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD"


# Only download this data when we need to (i.e. on the first run, or when forced).
def get_fremont_data(filename="Fremont.csv", url=FREMONT_URL, force_download=False):
    """Download and cache the fremont data

    Parameters
    ----------
    filename : string (optional)
        location to save the data
    url : string (optional)
        web location of the data
    force_download : bool (optional)
        if True, force redownload of data

    Returns
    -------
    data : pandas.DataFrame
        The fremont bridge data
    """
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)
    # Read from `filename` rather than a hard-coded path, so that a
    # non-default save location is actually the file that gets loaded.
    data = pd.read_csv(filename, index_col="Date", parse_dates=True)
    data.columns = ["West", "East"]
    data["Total"] = data["West"] + data["East"]
    return data

In [ ]: