JupyterWorkflow3

From exploratory analysis to reproducible research

Mehmetcan Budak



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("seaborn")



In [2]:

    
from jupyterworkflow.data import get_fremont_data



In [3]:

    
# Load the Fremont counts through the package helper, then preview the
# first five rows as the cell output.
data = get_fremont_data()
data.head(n=5)









    Out[3]:







  
    
      
      West
      East
      Total
    
    
      Date
      
      
      
    
  
  
    
      2012-10-03 00:00:00
      4.0
      9.0
      13.0
    
    
      2012-10-03 01:00:00
      4.0
      6.0
      10.0
    
    
      2012-10-03 02:00:00
      1.0
      1.0
      2.0
    
    
      2012-10-03 03:00:00
      2.0
      3.0
      5.0
    
    
      2012-10-03 04:00:00
      6.0
      1.0
      7.0



In [4]:

    
# Sum every column over weekly ("W") bins, then plot the result.
weekly_totals = data.resample("W").sum()
weekly_totals.plot()









    Out[4]:





<matplotlib.axes._subplots.AxesSubplot at 0x10a8d3ef0>



In [5]:

    
# Group the rows by the time-of-day part of the index and plot the mean of
# each column per group.
mean_by_time = data.groupby(data.index.time).mean()
mean_by_time.plot()









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x1030b79e8>



In [6]:

    
# Reshape "Total" into a table with one row per time of day and one column
# per calendar date, then show the top-left 5x5 corner.
pivoted = data.pivot_table(
    values="Total",
    index=data.index.time,
    columns=data.index.date,
)
pivoted.iloc[:5, :5]



In [7]:

    
# Draw one nearly transparent line per date column; the legend is disabled
# because there is one entry per column.
pivoted.plot.line(legend=False, alpha=0.01)









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x10b480da0>

SECOND PART: Make a Python package so that we and other people can reuse it for analysis.

Go to the directory

mkdir jupyterworkflow — create a directory.
touch jupyterworkflow/__init__.py — initialize a Python package.
Then create a data.py in this directory.

import os
from urllib.request import urlretrieve

import pandas as pd

FREMONT_URL = "https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD"


# Only download the data when we actually need to (e.g. on the first run).
def get_fremont_data(filename="Fremont.csv", url=FREMONT_URL, force_download=False):
    """Download and cache the Fremont data.

    Parameters
    ----------
    filename : string (optional)
        location to save the data
    url : string (optional)
        web location of the data
    force_download : bool (optional)
        if True, force redownload of data

    Returns
    -------
    data : pandas.DataFrame
        The Fremont bridge data
    """
    # Skip the download when a cached copy already exists, unless forced.
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)
    # Read back from `filename` rather than a hard-coded path, so a custom
    # cache location is the file that actually gets loaded.
    data = pd.read_csv(filename, index_col="Date", parse_dates=True)
    data.columns = ["West", "East"]
    data["Total"] = data["West"] + data["East"]
    return data



In [8]:

    
#get_fremont_data?

This is a good time to test the tools and confirm they do what we want: unit tests.



In [ ]:

	West	East	Total
Date
2012-10-03 00:00:00	4.0	9.0	13.0
2012-10-03 01:00:00	4.0	6.0	10.0
2012-10-03 02:00:00	1.0	1.0	2.0
2012-10-03 03:00:00	2.0	3.0	5.0
2012-10-03 04:00:00	6.0	1.0	7.0

	2012-10-03	2012-10-04	2012-10-05	2012-10-06	2012-10-07
00:00:00	13.0	18.0	11.0	15.0	11.0
01:00:00	10.0	3.0	8.0	15.0	17.0
02:00:00	2.0	9.0	7.0	9.0	3.0
03:00:00	5.0	3.0	4.0	3.0	6.0
04:00:00	7.0	8.0	9.0	5.0	3.0