notebook.community

Edit and run



In [1]:

    
#change style
import matplotlib.pyplot as plt
plt.style.use("seaborn")
import numpy as np
%matplotlib inline



In [2]:

    
# The triple-quoted string below is NOT executed: it is an archived copy of the
# loader that was moved into the jupyterworkflow package (data.py), kept here
# for reference. The archived copy reads the CSV from the `filename` argument
# (it previously read the global `my_file`, ignoring the argument).
"""#I put all of this in data.py
from urllib.request import urlretrieve
import pandas as pd
import os

#If I want to just refer to it on my computer
my_file = "data.csv"
URL="https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD"

def get_data(filename,url):
    if not os.path.exists(filename):
        print("downloading data")
        urlretrieve(url,filename)
    data=pd.read_csv(filename, index_col="Date", parse_dates=True) #for time series, make sure that the index is properly set
    data.columns=["West","East"] # so as to shorten the legend
    data["Total"]=data["West"]+data["East"]
    return data

data=get_data(my_file,URL)
"""
# Live code: load the frame through the packaged loader instead.
from jupyterworkflow.data import get_data   # the Python package you created

data = get_data()

We also refactored it for speed by replacing parse_dates: because parse_dates reads each string and tries to infer its structure, it can take a very long time.

However, if I state explicitly what the structure is, parsing is much faster. Look up the "strftime" format codes.

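Basically, in the package we changed it to data.index=pd.to_datetime(data.index,format="%m/%d/%Y %H:%M:%S %p"). Note that %p (AM/PM) only affects the parsed hour when it is paired with the 12-hour directive %I, so if the timestamps use a 12-hour clock the format should be "%m/%d/%Y %I:%M:%S %p".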



In [3]:

    
# Preview the first five hourly rows of the loaded frame.
data.head(n=5)









    Out[3]:






  
    
      
      West
      East
      Total
    
    
      Date
      
      
      
    
  
  
    
      2012-10-03 00:00:00
      4.0
      9.0
      13.0
    
    
      2012-10-03 01:00:00
      4.0
      6.0
      10.0
    
    
      2012-10-03 02:00:00
      1.0
      1.0
      2.0
    
    
      2012-10-03 03:00:00
      2.0
      3.0
      5.0
    
    
      2012-10-03 04:00:00
      6.0
      1.0
      7.0



In [4]:

    
# Collapse the hourly counts into weekly totals, then chart every column.
weekly_data = (
    data
    .resample("W")  # one bin per week
    .sum()          # total count inside each bin
)

weekly_data.plot()









    Out[4]:





<matplotlib.axes._subplots.AxesSubplot at 0x254cb34e438>



In [5]:

    
# Same weekly-total chart, written as a single method chain with no
# intermediate variable.
(
    data
    .resample("W")
    .sum()
    .plot()
)









    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x254ca961390>



In [6]:

    
# The raw counts are hourly, so going to a daily basis requires choosing the
# aggregate: the mean of the day's hourly counts, or the day's total.
daily_average = data.resample("D").mean()
daily_total = data.resample("D").sum()

# Take the daily totals and sum them over a rolling window of 365 rows (days).
daily_total_rolling_sum = daily_total.rolling(365).sum()
daily_total_rolling_sum.plot()









    Out[6]:





<matplotlib.axes._subplots.AxesSubplot at 0x254ca8809e8>



In [7]:

    
# Setting the y-axis: keep the Axes object that plot() returns in `ax` so the
# axis limits can be adjusted after the chart is drawn.
ax = (
    data
    .resample("D")
    .sum()
    .rolling(365)
    .sum()
    .plot()
)
ax.set_ylim(0, None); # lower limit fixed at 0; None leaves the upper limit as it is



In [8]:

    
# Three things, printed one after another:
#   1. data.index       - whatever was chosen as the index column (a DatetimeIndex here)
#   2. data.index.time  - the TIME OF DAY ONLY of each timestamp (datetime.time objects)
#   3. the first rows of the per-time-of-day mean obtained by grouping on (2)
print(
    data.index,
    data.index.time,
    data.groupby(data.index.time).mean().head(),
    sep="\n",
)









    



DatetimeIndex(['2012-10-03 00:00:00', '2012-10-03 01:00:00',
               '2012-10-03 02:00:00', '2012-10-03 03:00:00',
               '2012-10-03 04:00:00', '2012-10-03 05:00:00',
               '2012-10-03 06:00:00', '2012-10-03 07:00:00',
               '2012-10-03 08:00:00', '2012-10-03 09:00:00',
               ...
               '2017-04-30 14:00:00', '2017-04-30 15:00:00',
               '2017-04-30 16:00:00', '2017-04-30 17:00:00',
               '2017-04-30 18:00:00', '2017-04-30 19:00:00',
               '2017-04-30 20:00:00', '2017-04-30 21:00:00',
               '2017-04-30 22:00:00', '2017-04-30 23:00:00'],
              dtype='datetime64[ns]', name='Date', length=40104, freq=None)
[datetime.time(0, 0) datetime.time(1, 0) datetime.time(2, 0) ...,
 datetime.time(21, 0) datetime.time(22, 0) datetime.time(23, 0)]
              West      East      Total
00:00:00  5.427887  6.833034  12.260922
01:00:00  3.012567  3.503890   6.516457
02:00:00  2.236211  2.172062   4.408273
03:00:00  1.619988  1.378217   2.998205
04:00:00  3.417714  2.572113   5.989826



In [9]:

    
# Intra-day pattern.
# data.index.time drops the date part and keeps only the time of day, so
# grouping on it gathers every row that shares the same time of day; the mean
# of each group is then plotted.
(
    data
    .groupby(data.index.time)
    .mean()
    .plot()
);



In [10]:

    
# Pivot tables work like Excel pivot tables: `values` selects what to
# summarise, `index` supplies the row keys, and `columns` is not "extra data"
# but simply another way of segmenting the values that were selected.

# Default aggregation is the mean: average Total for each time of day.
mean_by_time = data.pivot_table(values=["Total"], index=data.index.time)
print(mean_by_time.iloc[:5])

# The aggregation function can be chosen; here it is the sum, i.e. the total of
# all bike counts for each hour of the day.
# "sum" is passed by name rather than as np.sum: newer pandas releases warn
# about NumPy callables here and are moving to apply them directly, which would
# change how missing values are handled.
sum_by_time = data.pivot_table(values=["Total"], index=data.index.time, aggfunc=["sum"])
print(sum_by_time.iloc[:5])

# `columns` gives an extra way to segment the data: one column per calendar
# date, one row per time of day. This is the table the next cell plots.
pivoted = data.pivot_table("Total", index=data.index.time, columns=data.index.date)
print(pivoted.iloc[:5, :5])

# When plotted, think of the index as what goes on the x-axis.









    



              Total
00:00:00  12.260922
01:00:00   6.516457
02:00:00   4.408273
03:00:00   2.998205
04:00:00   5.989826
              sum
            Total
00:00:00  20488.0
01:00:00  10889.0
02:00:00   7353.0
03:00:00   5010.0
04:00:00  10009.0
          2012-10-03  2012-10-04  2012-10-05  2012-10-06  2012-10-07
00:00:00        13.0        18.0        11.0        15.0        11.0
01:00:00        10.0         3.0         8.0        15.0        17.0
02:00:00         2.0         9.0         7.0         9.0         3.0
03:00:00         5.0         3.0         4.0         3.0         6.0
04:00:00         7.0         8.0         9.0         5.0         3.0



In [11]:

    
# Takes a while to draw: plot() draws one line per column, and `pivoted` has one
# column per day, so every day becomes its own, nearly transparent, line.
pivoted.plot(alpha=0.01, legend=False)

# Compare with simply plotting the original data, where the days follow one
# after the other along the axis instead of each day being its own entire line.









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x254ca9c18d0>



In [12]:

    
#you'll notice that there's some noise in the middle, but most of the time
# the traffic happens during commute times



In [13]:

    
#the following will bring up the """ whatever is here""" that's inside the function 
#
get_data?



In [ ]:

	West	East	Total
Date
2012-10-03 00:00:00	4.0	9.0	13.0
2012-10-03 01:00:00	4.0	6.0	10.0
2012-10-03 02:00:00	1.0	1.0	2.0
2012-10-03 03:00:00	2.0	3.0	5.0
2012-10-03 04:00:00	6.0	1.0	7.0