In [1]:
    
from preamble import *
% matplotlib notebook
    
In [2]:
    
from glob import glob

# Read every file matching data/citibike/*.csv into its own DataFrame.
# glob() returns paths in arbitrary, filesystem-dependent order, so the paths
# are sorted to make the row order of the combined data reproducible.
csv_paths = sorted(glob("data/citibike/*.csv"))
dfs = [pd.read_csv(csv_path) for csv_path in csv_paths]
    
In [3]:
    
# Stack the per-file frames into a single table.
# pd.concat() on an empty list fails with a terse "No objects to concatenate";
# raise the same exception type with a message that points at the likely cause.
if not dfs:
    raise ValueError(
        "no CSV files were loaded; expected files matching data/citibike/*.csv"
    )
# ignore_index=True replaces the per-file row labels, which would otherwise
# repeat from file to file, with one unique running index.
data = pd.concat(dfs, ignore_index=True)
    
In [4]:
    
# Display the column labels of the combined frame.
data.columns
    
    Out[4]:
In [5]:
    
# Preview the first rows of the combined frame (head() defaults to five rows).
data.head()
    
    Out[5]:
In [6]:
    
# Add a constant column "one" (summing it later counts rows), parse
# "starttime" into datetimes and make it the index.
#
# The guard keeps the cell re-runnable: after the first run "starttime" is the
# index rather than a column, so repeating the conversion unconditionally
# would fail on the column lookup.
if data.index.name != "starttime":
    data = data.assign(
        one=1,
        starttime=pd.to_datetime(data["starttime"]),
    ).set_index("starttime")
    
In [7]:
    
# Count rows per start station in 3-hour bins: within each station group the
# constant "one" column is resampled on the datetime index and summed.
ones_by_station = data.groupby("start station id")["one"]
data_resampled = ones_by_station.resample("3h").sum()
    
In [8]:
    
# Move the outermost index level (level 0, the groupby key) into the columns,
# then replace the NaNs left by combinations that have no entry with 0.
station_columns = data_resampled.unstack(level=0)
per_station = station_columns.fillna(0)
    
In [9]:
    
# Plot the 3-hourly series for the station with id 301 on its own figure.
# Title and y-label are set on an explicit Axes so the figure can be read
# without the surrounding code.
fig, ax = plt.subplots()
ax.set_title("Station 301")
ax.set_ylabel("trips started per 3 h")
per_station[301].plot(ax=ax)
    
    
    
    Out[9]:
In [10]:
    
from sklearn.gaussian_process import GaussianProcessRegressor
    
In [11]:
    
# Target: the series for station 301 as a NumPy array.
station_counts = per_station[301]
y = station_counts.values
# Feature: the running position 0..len(y)-1 of each sample, shaped as a
# single-column 2-D array as expected by scikit-learn estimators.
X = np.arange(y.shape[0])[:, np.newaxis]
    
In [12]:
    
# Baseline model: no kernel is passed, so the estimator's default is used.
# fit() returns the estimator itself, so `gp` is the fitted model.
baseline_gp = GaussianProcessRegressor()
gp = baseline_gp.fit(X, y)
    
In [13]:
    
# Overlay the observed series and the model's predictions for every row of X.
# Drawn on an explicit Axes with labels so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")
ax.set_title("Observed series vs. GP predictions")
ax.set_xlabel("3-hour bin index")
ax.set_ylabel("trips started")
ax.legend()
    
    
    
    Out[13]:
In [14]:
    
# Display the kernel of the fitted model (the one it uses for prediction).
gp.kernel_
    
    Out[14]:
In [18]:
    
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel

# X counts 3-hour bins, so 8 bins span one day and 56 bins span one week.
BINS_PER_DAY = 8
BINS_PER_WEEK = 56
# Only the first N_TRAIN samples are used for fitting; the rest are held out.
N_TRAIN = 1500

# Two non-periodic components with different admissible length scales.
# RBF's default length_scale (1.0) lies outside both bound intervals, so the
# starting value is given explicitly, at the lower bound of each interval.
# NOTE(review): a start in the interior of the bounds may optimise better, but
# would change the fitted result - confirm before moving it.
short_scale = 1.0 * RBF(length_scale=2, length_scale_bounds=(2, 500))
long_scale = 1.0 * RBF(length_scale=50, length_scale_bounds=(50, 1000))

# Periodic components with the period held fixed at one day / one week; each
# is multiplied by an RBF whose length scale is optimised within (2, 500).
daily = (
    1.0
    * RBF(length_scale=100, length_scale_bounds=(2, 500))
    * ExpSineSquared(periodicity=BINS_PER_DAY, periodicity_bounds="fixed")
)
weekly = (
    1.0
    * RBF(length_scale=100, length_scale_bounds=(2, 500))
    * ExpSineSquared(periodicity=BINS_PER_WEEK, periodicity_bounds="fixed")
)

# A learnable noise term, "+ 1.0 * WhiteKernel(noise_level=1)", is left
# disabled; alpha=1 adds a fixed value to the kernel-matrix diagonal instead.
kernel = short_scale + long_scale + daily + weekly

gp = GaussianProcessRegressor(alpha=1, normalize_y=True, kernel=kernel).fit(
    X[:N_TRAIN], y[:N_TRAIN]
)
    
In [20]:
    
# Overlay the observed series and the model's predictions for every row of X.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")

# When the model was fitted on a leading slice of X, mark where that slice
# ends: predictions to the right of the line are for rows not used in fitting.
# X_train_ holds the rows passed to fit(), and X is a running index, so the
# number of training rows is also the x position of the boundary.
n_train = gp.X_train_.shape[0]
if n_train < len(y):
    ax.axvline(n_train, color="k", linestyle="--", label="end of training data")

ax.set_title("Observed series vs. GP predictions")
ax.set_xlabel("3-hour bin index")
ax.set_ylabel("trips started")
ax.legend()
    
    
    
    Out[20]:
In [21]:
    
# Display the kernel of the fitted model (the one it uses for prediction).
gp.kernel_
    
    Out[21]: