In [1]:
%matplotlib inline
# %config InlineBackend.figure_format='retina'
import dask.dataframe as dd
import numpy as np
import pandas as pd
# import seaborn as sgn
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt

In [12]:
# Raise saved-figure resolution above the matplotlib default (100 dpi).
plt.rcParams['savefig.dpi'] = 125

In [2]:
# df = dd.read_parquet('/bigdata/citibike.parquet')
# df.set_index('start_time', npartitions='auto').to_parquet('/bigdata/citibike_repartitioned.parquet')

In [22]:
# Lazily load the Citi Bike trips, pre-repartitioned with start_time as the
# index (see the commented-out preprocessing cell above).
# NOTE(review): hardcoded absolute local path — consider a DATA_DIR constant.
df = dd.read_parquet('/bigdata/citibike_repartitioned.parquet')

In [23]:
def get_is_bus_day(x):
    """Return a boolean array: True where the DatetimeIndex of ``x`` falls on
    a NumPy business day (Mon-Fri; no holiday calendar)."""
    # '<M8[D]' truncates the timestamps to day resolution for np.is_busday.
    return np.is_busday(x.index.values.astype('<M8[D]'))

def get_day(x):
    """Return the DatetimeIndex of ``x`` floored to midnight of each day."""
    # Round-trips through ISO date strings. Plain ISO dates parse directly;
    # the deprecated (and removed in pandas 2.x) `infer_datetime_format`
    # flag the original passed is unnecessary.
    return pd.to_datetime(x.index.values.astype('<M8[D]').astype(str))

def get_week(x):
    """Return the DatetimeIndex of ``x`` floored to its epoch-week boundary.

    NOTE: numpy '<M8[W]' weeks count from 1970-01-01 (a Thursday), so each
    returned date is a Thursday, not a Monday.
    """
    return pd.to_datetime(x.index.values.astype('<M8[W]').astype(str))


# Materialize per-row calendar features; map_partitions keeps this lazy.
df['year_month_day'] = df.map_partitions(get_day, meta=('year_month_day', '<M8[ns]'))
df['year_month_week'] = df.map_partitions(get_week, meta=('year_month_week', '<M8[ns]'))
# `np.bool` was removed from NumPy (deprecated 1.20, removed 1.24); the
# builtin `bool` is the supported spelling for the dtype in `meta`.
df['is_bus_day'] = df.map_partitions(get_is_bus_day, meta=('is_bus_day', bool))

In [24]:
# Spot-check a few trips on non-business days; boolean negation with `~`
# replaces the non-idiomatic `== False` comparison.
df[~df.is_bus_day].head()


Out[24]:
trip_duration stop_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type birth_year gender start_taxizone_id end_taxizone_id year_month_day year_month_week is_bus_day
start_time
2013-07-06 00:00:43 1593 2013-07-06 00:27:16 224 Spruce St & Nassau St 40.711464 -74.005524 232 Cadman Plaza E & Tillary St 40.695977 -73.990149 15355 Customer NaN 0 209.0 65.0 2013-07-06 2013-07-04 False
2013-07-06 00:00:49 173 2013-07-06 00:03:42 482 W 15 St & 7 Ave 40.739355 -73.999318 482 W 15 St & 7 Ave 40.739355 -73.999318 14850 Subscriber 1958.0 1 90.0 90.0 2013-07-06 2013-07-04 False
2013-07-06 00:00:56 1494 2013-07-06 00:25:50 282 Kent Ave & S 11 St 40.708273 -73.968341 539 Metropolitan Ave & Bedford Ave 40.715348 -73.960241 16233 Customer NaN 0 256.0 255.0 2013-07-06 2013-07-04 False
2013-07-06 00:01:08 938 2013-07-06 00:16:46 164 E 47 St & 2 Ave 40.753231 -73.970325 2022 E 59 St & Sutton Pl 40.758491 -73.959206 20186 Customer NaN 0 233.0 140.0 2013-07-06 2013-07-04 False
2013-07-06 00:01:10 349 2013-07-06 00:06:59 229 Great Jones St 40.727434 -73.993790 428 E 3 St & 1 Ave 40.724677 -73.987834 15735 Subscriber 1986.0 1 114.0 79.0 2013-07-06 2013-07-04 False

In [26]:
# Daily and weekly trip counts over the full dataset (trip_duration is only
# carried along so .count() has a column to count).
day_cols = ['trip_duration', 'year_month_day']
week_cols = ['trip_duration', 'year_month_week']
timedf = df[day_cols].groupby('year_month_day').count().compute()
timedf_wk = df[week_cols].groupby('year_month_week').count().compute()

In [58]:
# Daily/weekly counts restricted to short-term customers ("guests").
# Filter rows first, then select columns — same result as the original
# column-select-then-boolean-index chain, but clearer and avoids the
# chained-indexing anti-pattern.
customers = df[df.user_type == 'Customer']
timedf2 = customers[['trip_duration', 'year_month_day']].groupby('year_month_day').count().compute()
timedf2_wk = customers[['trip_duration', 'year_month_week']].groupby('year_month_week').count().compute()

In [60]:
# Daily/weekly counts restricted to annual subscribers, filtered the same
# way as the Customer cell above (rows first, then columns).
subscribers = df[df.user_type == 'Subscriber']
timedf3 = subscribers[['trip_duration', 'year_month_day']].groupby('year_month_day').count().compute()
timedf3_wk = subscribers[['trip_duration', 'year_month_week']].groupby('year_month_week').count().compute()

In [7]:
import matplotlib.dates as mdates

In [100]:
# Plot daily trip counts (faint) and the 7-day average (solid) for all
# riders plus the Customer/Subscriber breakdowns. The three copy-pasted
# plot pairs of the original are folded into one loop; the unused `months`
# locator and the misnamed `yearsFmt` (it formats months) are gone.
series = [
    (timedf, timedf_wk, 'C0', ''),
    (timedf2, timedf2_wk, 'C1', 'Guests '),
    (timedf3, timedf3_wk, 'C2', 'Subscriber '),
]
for daily, weekly, color, prefix in series:
    plt.plot(daily.index.values, daily.trip_duration, alpha=0.4, color=color,
             label=prefix + 'Trips per day')
    # Weekly totals divided by 7 give the average trips per day that week.
    plt.plot(weekly.index.values, weekly.trip_duration/7., color=color,
             alpha=1, label=prefix + 'One week average of trips per day')

plt.xlabel('Date')
plt.ylabel("Trips Per Day")
plt.gcf().set_size_inches(10, 5)

ax = plt.gca()
# Major ticks every quarter ("Jul 2013" style), minor ticks every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(10000))
ax.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(2500))

plt.legend(loc='upper left')

plt.xlim('2013-07-01', '2017-01-01')
plt.ylim(0, 75000)

plt.gcf().autofmt_xdate()



In [44]:
# Daily Central Park weather observations — presumably a GHCN-Daily export,
# given the PRCP/SNWD/SNOW/TMAX/TMIN/AWND columns used below (TODO confirm).
weather_df = pd.read_csv('../central_park_weather.csv.gz')

In [53]:
# Parse the YYYYMMDD-encoded DATE column into proper Timestamps so it can
# join against the trip-count date index.
weather_df['DATE'] = pd.to_datetime(weather_df.DATE, format='%Y%m%d')

In [64]:
# Attach the weather covariates to the daily trip counts; a left join keeps
# every count day even if its weather row is missing.
weather_cols = ['DATE', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN', 'AWND']
merged_counts = timedf.merge(weather_df[weather_cols], left_index=True,
                             right_on='DATE', how='left')

In [73]:
# Rename only the count column to 'N' for the regression formula below.
# The original assigned all 8 column names positionally, which silently
# mislabels data if the merge's column order ever changes.
merged_counts = merged_counts.rename(columns={'trip_duration': 'N'})

In [81]:
import statsmodels.api
import statsmodels.formula.api
from patsy import standardize as st

In [91]:
# OLS of daily trip count on standardized weather covariates; `st` is
# patsy.standardize, so each coefficient is the effect of a one-SD change.
weather_formula = "N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)"
model = statsmodels.formula.api.ols(formula=weather_formula, data=merged_counts)
model.fit().summary()


Out[91]:
OLS Regression Results
Dep. Variable: N R-squared: 0.549
Model: OLS Adj. R-squared: 0.547
Method: Least Squares F-statistic: 308.9
Date: Fri, 19 May 2017 Prob (F-statistic): 1.81e-216
Time: 15:09:19 Log-Likelihood: -13493.
No. Observations: 1276 AIC: 2.700e+04
Df Residuals: 1270 BIC: 2.703e+04
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.892e+04 265.571 108.898 0.000 2.84e+04 2.94e+04
st(PRCP) -3210.2392 273.517 -11.737 0.000 -3746.835 -2673.644
st(SNWD) -2102.8777 299.407 -7.023 0.000 -2690.265 -1515.490
st(SNOW) -182.5400 284.662 -0.641 0.521 -741.000 375.920
st(TMAX) 8357.3333 1012.308 8.256 0.000 6371.353 1.03e+04
st(TMIN) 422.5314 1020.685 0.414 0.679 -1579.882 2424.945
Omnibus: 126.581 Durbin-Watson: 0.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 168.763
Skew: 0.797 Prob(JB): 2.26e-37
Kurtosis: 3.798 Cond. No. 8.23

In [95]:
# NOTE(review): this cell is byte-identical to the In[91] cell above and its
# output is the same — a plain rerun. Consider deleting one of the two.
model = statsmodels.formula.api.ols(formula="N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)", data=merged_counts)
model.fit().summary()


Out[95]:
OLS Regression Results
Dep. Variable: N R-squared: 0.549
Model: OLS Adj. R-squared: 0.547
Method: Least Squares F-statistic: 308.9
Date: Fri, 19 May 2017 Prob (F-statistic): 1.81e-216
Time: 15:10:17 Log-Likelihood: -13493.
No. Observations: 1276 AIC: 2.700e+04
Df Residuals: 1270 BIC: 2.703e+04
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.892e+04 265.571 108.898 0.000 2.84e+04 2.94e+04
st(PRCP) -3210.2392 273.517 -11.737 0.000 -3746.835 -2673.644
st(SNWD) -2102.8777 299.407 -7.023 0.000 -2690.265 -1515.490
st(SNOW) -182.5400 284.662 -0.641 0.521 -741.000 375.920
st(TMAX) 8357.3333 1012.308 8.256 0.000 6371.353 1.03e+04
st(TMIN) 422.5314 1020.685 0.414 0.679 -1579.882 2424.945
Omnibus: 126.581 Durbin-Watson: 0.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 168.763
Skew: 0.797 Prob(JB): 2.26e-37
Kurtosis: 3.798 Cond. No. 8.23

In [68]:
# Inspect the OLS API (fit options, regularized fit) — interactive aid only;
# could be removed from the finished notebook.
help(statsmodels.api.OLS)


Help on class OLS in module statsmodels.regression.linear_model:

class OLS(WLS)
 |  A simple ordinary least squares model.
 |  
 |  
 |  Parameters
 |  ----------
 |  endog : array-like
 |      1-d endogenous response variable. The dependent variable.
 |  exog : array-like
 |      A nobs x k array where `nobs` is the number of observations and `k`
 |      is the number of regressors. An intercept is not included by default
 |      and should be added by the user. See
 |      :func:`statsmodels.tools.add_constant`.
 |  missing : str
 |      Available options are 'none', 'drop', and 'raise'. If 'none', no nan
 |      checking is done. If 'drop', any observations with nans are dropped.
 |      If 'raise', an error is raised. Default is 'none.'
 |  hasconst : None or bool
 |      Indicates whether the RHS includes a user-supplied constant. If True,
 |      a constant is not checked for and k_constant is set to 1 and all
 |      result statistics are calculated as if a constant is present. If
 |      False, a constant is not checked for and k_constant is set to 0.
 |  
 |  
 |  Attributes
 |  ----------
 |  weights : scalar
 |      Has an attribute weights = array(1.0) due to inheritance from WLS.
 |  
 |  See Also
 |  --------
 |  GLS
 |  
 |  Examples
 |  --------
 |  >>> import numpy as np
 |  >>>
 |  >>> import statsmodels.api as sm
 |  >>>
 |  >>> Y = [1,3,4,5,2,3,4]
 |  >>> X = range(1,8)
 |  >>> X = sm.add_constant(X)
 |  >>>
 |  >>> model = sm.OLS(Y,X)
 |  >>> results = model.fit()
 |  >>> results.params
 |  array([ 2.14285714,  0.25      ])
 |  >>> results.tvalues
 |  array([ 1.87867287,  0.98019606])
 |  >>> print(results.t_test([1, 0])))
 |  <T test: effect=array([ 2.14285714]), sd=array([[ 1.14062282]]), t=array([[ 1.87867287]]), p=array([[ 0.05953974]]), df_denom=5>
 |  >>> print(results.f_test(np.identity(2)))
 |  <F test: F=array([[ 19.46078431]]), p=[[ 0.00437251]], df_denom=5, df_num=2>
 |  
 |  Notes
 |  -----
 |  No constant is added by the model unless you are using formulas.
 |  
 |  Method resolution order:
 |      OLS
 |      WLS
 |      RegressionModel
 |      statsmodels.base.model.LikelihoodModel
 |      statsmodels.base.model.Model
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, endog, exog=None, missing='none', hasconst=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  loglike(self, params)
 |      The likelihood function for the clasical OLS model.
 |      
 |      Parameters
 |      ----------
 |      params : array-like
 |          The coefficients with which to estimate the log-likelihood.
 |      
 |      Returns
 |      -------
 |      The concentrated likelihood function evaluated at params.
 |  
 |  whiten(self, Y)
 |      OLS model whitener does nothing: returns Y.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from RegressionModel:
 |  
 |  fit(self, method='pinv', cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs)
 |      Full fit of the model.
 |      
 |      The results include an estimate of covariance matrix, (whitened)
 |      residuals and an estimate of scale.
 |      
 |      Parameters
 |      ----------
 |      method : str
 |          Can be "pinv", "qr".  "pinv" uses the Moore-Penrose pseudoinverse
 |          to solve the least squares problem. "qr" uses the QR
 |          factorization.
 |      
 |      Returns
 |      -------
 |      A RegressionResults class instance.
 |      
 |      See Also
 |      ---------
 |      regression.RegressionResults
 |      
 |      Notes
 |      -----
 |      The fit method uses the pseudoinverse of the design/exogenous variables
 |      to solve the least squares minimization.
 |  
 |  fit_regularized(self, method='coord_descent', maxiter=1000, alpha=0.0, L1_wt=1.0, start_params=None, cnvrg_tol=1e-08, zero_tol=1e-08, **kwargs)
 |      Return a regularized fit to a linear regression model.
 |      
 |      Parameters
 |      ----------
 |      method : string
 |          Only the coordinate descent algorithm is implemented.
 |      maxiter : integer
 |          The maximum number of iteration cycles (an iteration cycle
 |          involves running coordinate descent on all variables).
 |      alpha : scalar or array-like
 |          The penalty weight.  If a scalar, the same penalty weight
 |          applies to all variables in the model.  If a vector, it
 |          must have the same length as `params`, and contains a
 |          penalty weight for each coefficient.
 |      L1_wt : scalar
 |          The fraction of the penalty given to the L1 penalty term.
 |          Must be between 0 and 1 (inclusive).  If 0, the fit is
 |          ridge regression.  If 1, the fit is the lasso.
 |      start_params : array-like
 |          Starting values for ``params``.
 |      cnvrg_tol : scalar
 |          If ``params`` changes by less than this amount (in sup-norm)
 |          in once iteration cycle, the algorithm terminates with
 |          convergence.
 |      zero_tol : scalar
 |          Any estimated coefficient smaller than this value is
 |          replaced with zero.
 |      
 |      Returns
 |      -------
 |      A RegressionResults object, of the same type returned by
 |      ``fit``.
 |      
 |      Notes
 |      -----
 |      The approach closely follows that implemented in the glmnet
 |      package in R.  The penalty is the "elastic net" penalty, which
 |      is a convex combination of L1 and L2 penalties.
 |      
 |      The function that is minimized is: ..math::
 |      
 |          0.5*RSS/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)
 |      
 |      where RSS is the usual regression sum of squares, n is the
 |      sample size, and :math:`|*|_1` and :math:`|*|_2` are the L1 and L2
 |      norms.
 |      
 |      Post-estimation results are based on the same data used to
 |      select variables, hence may be subject to overfitting biases.
 |      
 |      References
 |      ----------
 |      Friedman, Hastie, Tibshirani (2008).  Regularization paths for
 |      generalized linear models via coordinate descent.  Journal of
 |      Statistical Software 33(1), 1-22 Feb 2010.
 |  
 |  initialize(self)
 |      Initialize (possibly re-initialize) a Model instance. For
 |      instance, the design matrix of a linear model may change
 |      and some things must be recomputed.
 |  
 |  predict(self, params, exog=None)
 |      Return linear predicted values from a design matrix.
 |      
 |      Parameters
 |      ----------
 |      params : array-like
 |          Parameters of a linear model
 |      exog : array-like, optional.
 |          Design / exogenous data. Model exog is used if None.
 |      
 |      Returns
 |      -------
 |      An array of fitted values
 |      
 |      Notes
 |      -----
 |      If the model has not yet been fit, params is not optional.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from RegressionModel:
 |  
 |  df_model
 |      The model degree of freedom, defined as the rank of the regressor
 |      matrix minus 1 if a constant is included.
 |  
 |  df_resid
 |      The residual degree of freedom, defined as the number of observations
 |      minus the rank of the regressor matrix.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from statsmodels.base.model.LikelihoodModel:
 |  
 |  hessian(self, params)
 |      The Hessian matrix of the model
 |  
 |  information(self, params)
 |      Fisher information matrix of model
 |      
 |      Returns -Hessian of loglike evaluated at params.
 |  
 |  score(self, params)
 |      Score vector of model.
 |      
 |      The gradient of logL with respect to each parameter.
 |  
 |  ----------------------------------------------------------------------
 |  Class methods inherited from statsmodels.base.model.Model:
 |  
 |  from_formula(formula, data, subset=None, *args, **kwargs) from builtins.type
 |      Create a Model from a formula and dataframe.
 |      
 |      Parameters
 |      ----------
 |      formula : str or generic Formula object
 |          The formula specifying the model
 |      data : array-like
 |          The data for the model. See Notes.
 |      subset : array-like
 |          An array-like object of booleans, integers, or index values that
 |          indicate the subset of df to use in the model. Assumes df is a
 |          `pandas.DataFrame`
 |      args : extra arguments
 |          These are passed to the model
 |      kwargs : extra keyword arguments
 |          These are passed to the model with one exception. The
 |          ``eval_env`` keyword is passed to patsy. It can be either a
 |          :class:`patsy:patsy.EvalEnvironment` object or an integer
 |          indicating the depth of the namespace to use. For example, the
 |          default ``eval_env=0`` uses the calling namespace. If you wish
 |          to use a "clean" environment set ``eval_env=-1``.
 |      
 |      
 |      Returns
 |      -------
 |      model : Model instance
 |      
 |      Notes
 |      ------
 |      data must define __getitem__ with the keys in the formula terms
 |      args and kwargs are passed on to the model instantiation. E.g.,
 |      a numpy structured or rec array, a dictionary, or a pandas DataFrame.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from statsmodels.base.model.Model:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  endog_names
 |  
 |  exog_names


In [49]:
# Rank start stations by number of business-day trips.
# NOTE(review): `df2` is not defined in any visible cell — this relies on
# hidden kernel state and will fail on Restart & Run All. Presumably it is
# some filtered copy of `df`; TODO: restore the defining cell.
df2[['is_bus_day', 'start_station_id']][df2.is_bus_day == True].groupby('start_station_id') \
    .count().compute().sort_values('is_bus_day', ascending=False)


Out[49]:
is_bus_day
start_station_id
519 356749
497 235838
521 233325
435 228568
293 214210
402 208014
490 206323
477 199371
426 196368
285 196145
151 178778
444 177460
284 176299
379 174838
523 173904
459 170351
382 166108
327 164709
168 161661
368 160471
358 153488
359 152552
472 152160
127 149701
492 148322
417 146737
499 146562
517 145338
537 144539
446 140362
... ...
3302 825
3332 754
3401 713
3424 698
3348 667
3229 658
3338 645
3340 561
3333 509
3440 478
3395 453
3330 420
3393 393
3394 325
3342 303
3326 284
3130 201
3245 177
3219 170
3014 100
3371 40
3250 25
3432 24
3239 18
3017 10
3036 10
3040 8
3266 7
3240 3
3385 1

659 rows × 1 columns


In [50]:
# Per-station trip counts over all days (not just business days).
# NOTE(review): `df2` again comes from hidden kernel state (undefined here).
# `.count()` counts non-null values per column, so the three count columns
# need not be identical when a column has missing values.
zz = df2[['start_station_id', 'is_bus_day', 'start_time', 'start_station_name']].groupby('start_station_id') \
 .count().compute().sort_values('is_bus_day', ascending=False)
zz


Out[50]:
is_bus_day start_time start_station_name
start_station_id
519 397813 397813 397813
497 315510 315510 315510
435 299698 299698 299698
426 280868 280868 280868
293 279981 279981 279981
521 268807 268807 268807
285 266162 266162 266162
402 260880 260880 260880
151 247791 247791 247791
490 245801 245801 245801
284 236954 236954 236954
477 229820 229820 229820
444 229799 229799 229799
459 227690 227690 227690
368 227610 227610 227610
382 219669 219669 219669
327 215974 215974 215974
499 215419 215419 215419
358 213026 213026 213026
168 204413 204413 204413
379 200189 200189 200189
523 196410 196410 196410
387 191457 191457 191457
2006 189364 189364 189364
127 187794 187794 187794
514 186694 186694 186694
3002 185525 185525 185525
446 181832 181832 181832
472 180320 180320 180320
504 179485 179485 179485
... ... ... ...
3424 929 929 929
3229 916 916 916
3338 900 900 900
255 897 897 897
3340 794 794 794
3333 669 669 669
3330 610 610 610
3440 588 588 588
3395 567 567 567
3393 562 562 562
3394 450 450 450
3326 429 429 429
3342 410 410 410
3130 351 351 351
3014 324 324 324
3219 180 180 180
3257 178 178 178
3245 177 177 177
3253 62 62 62
3371 53 53 53
3250 25 25 25
3432 24 24 24
3252 20 20 20
3239 18 18 18
3017 12 12 12
3040 11 11 11
3036 10 10 10
3266 7 7 7
3240 3 3 3
3385 1 1 1

662 rows × 3 columns


In [51]:
# Station ids with more than 500 recorded trips — the keep-list used below.
zz[zz.is_bus_day>500].index


Out[51]:
Int64Index([ 519,  497,  435,  426,  293,  521,  285,  402,  151,  490,
            ...
            3424, 3229, 3338,  255, 3340, 3333, 3330, 3440, 3395, 3393],
           dtype='int64', name='start_station_id', length=642)

In [53]:
# Keep only trips that start at the >500-trip stations identified above.
# NOTE(review): depends on the undefined `df2` (hidden kernel state).
df3 = df2[df2.start_station_id.isin(zz[zz.is_bus_day>500].index)]

In [56]:
# Non-null counts per column after the station filter — compare with the
# unfiltered `df.count()` cell below (only ~2.7k rows dropped).
df3.count().compute()


Out[56]:
start_time                 36899280
trip_duration              36899280
stop_time                  36899280
start_station_id           36899280
start_station_name         36899280
start_station_latitude     36899280
start_station_longitude    36899280
end_station_id             36899280
end_station_name           36899280
end_station_latitude       36899280
end_station_longitude      36899280
bike_id                    36899280
user_type                  36863421
birth_year                 32541201
gender                     36899280
start_taxizone_id          36899266
end_taxizone_id            36898966
is_bus_day                 36899280
dtype: int64

In [58]:
# Baseline non-null counts for the full, unfiltered trips frame.
df.count().compute()


Out[58]:
trip_duration              36902025
stop_time                  36902025
start_station_id           36902025
start_station_name         36902025
start_station_latitude     36902025
start_station_longitude    36902025
end_station_id             36902025
end_station_name           36902025
end_station_latitude       36902025
end_station_longitude      36902025
bike_id                    36902025
user_type                  36866154
birth_year                 32543206
gender                     36902025
start_taxizone_id          36902006
end_taxizone_id            36901708
dtype: int64

In [63]:
import json

In [70]:
# Station metadata snapshot from the Citi Bike station feed JSON.
# Use a context manager so the file handle is closed deterministically —
# the original `json.load(open(...))` leaked the handle.
with open("../15_dataframe_analysis/stations.2017.04.20.09.43.json") as f:
    df_stations = pd.DataFrame(json.load(f)['stationBeanList'])

In [ ]: