In [1]:
%matplotlib inline
# %config InlineBackend.figure_format='retina'
import dask.dataframe as dd
import numpy as np
import pandas as pd
# import seaborn as sgn
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt

In [12]:
# Raise saved-figure resolution above the matplotlib default (100 dpi).
plt.rcParams['savefig.dpi'] = 125

In [2]:
# df = dd.read_parquet('/bigdata/citibike.parquet')
# df.set_index('start_time', npartitions='auto').to_parquet('/bigdata/citibike_repartitioned.parquet')

In [22]:
# Lazily load the Citi Bike trips, pre-repartitioned with start_time as the
# index (see the commented-out preprocessing cell above).
# NOTE(review): hardcoded absolute local path — consider a DATA_DIR constant.
df = dd.read_parquet('/bigdata/citibike_repartitioned.parquet')

In [23]:
def get_is_bus_day(x):
    """Return a boolean array: True where the DatetimeIndex of ``x`` falls on
    a NumPy business day (Mon-Fri; no holiday calendar)."""
    # '<M8[D]' truncates the timestamps to day resolution for np.is_busday.
    return np.is_busday(x.index.values.astype('<M8[D]'))

def get_day(x):
    """Return the DatetimeIndex of ``x`` floored to midnight of each day."""
    # Round-trips through ISO date strings. Plain ISO dates parse directly;
    # the deprecated (and removed in pandas 2.x) `infer_datetime_format`
    # flag the original passed is unnecessary.
    return pd.to_datetime(x.index.values.astype('<M8[D]').astype(str))

def get_week(x):
    """Return the DatetimeIndex of ``x`` floored to its epoch-week boundary.

    NOTE: numpy '<M8[W]' weeks count from 1970-01-01 (a Thursday), so each
    returned date is a Thursday, not a Monday.
    """
    return pd.to_datetime(x.index.values.astype('<M8[W]').astype(str))


# Materialize per-row calendar features; map_partitions keeps this lazy.
df['year_month_day'] = df.map_partitions(get_day, meta=('year_month_day', '<M8[ns]'))
df['year_month_week'] = df.map_partitions(get_week, meta=('year_month_week', '<M8[ns]'))
# `np.bool` was removed from NumPy (deprecated 1.20, removed 1.24); the
# builtin `bool` is the supported spelling for the dtype in `meta`.
df['is_bus_day'] = df.map_partitions(get_is_bus_day, meta=('is_bus_day', bool))

In [24]:
# Spot-check a few trips on non-business days; boolean negation with `~`
# replaces the non-idiomatic `== False` comparison.
df[~df.is_bus_day].head()


Out[24]:
trip_duration stop_time start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bike_id user_type birth_year gender start_taxizone_id end_taxizone_id year_month_day year_month_week is_bus_day
start_time
2013-07-06 00:00:43 1593 2013-07-06 00:27:16 224 Spruce St & Nassau St 40.711464 -74.005524 232 Cadman Plaza E & Tillary St 40.695977 -73.990149 15355 Customer NaN 0 209.0 65.0 2013-07-06 2013-07-04 False
2013-07-06 00:00:49 173 2013-07-06 00:03:42 482 W 15 St & 7 Ave 40.739355 -73.999318 482 W 15 St & 7 Ave 40.739355 -73.999318 14850 Subscriber 1958.0 1 90.0 90.0 2013-07-06 2013-07-04 False
2013-07-06 00:00:56 1494 2013-07-06 00:25:50 282 Kent Ave & S 11 St 40.708273 -73.968341 539 Metropolitan Ave & Bedford Ave 40.715348 -73.960241 16233 Customer NaN 0 256.0 255.0 2013-07-06 2013-07-04 False
2013-07-06 00:01:08 938 2013-07-06 00:16:46 164 E 47 St & 2 Ave 40.753231 -73.970325 2022 E 59 St & Sutton Pl 40.758491 -73.959206 20186 Customer NaN 0 233.0 140.0 2013-07-06 2013-07-04 False
2013-07-06 00:01:10 349 2013-07-06 00:06:59 229 Great Jones St 40.727434 -73.993790 428 E 3 St & 1 Ave 40.724677 -73.987834 15735 Subscriber 1986.0 1 114.0 79.0 2013-07-06 2013-07-04 False

In [26]:
# Daily and weekly trip counts over the full dataset (trip_duration is only
# carried along so .count() has a column to count).
day_cols = ['trip_duration', 'year_month_day']
week_cols = ['trip_duration', 'year_month_week']
timedf = df[day_cols].groupby('year_month_day').count().compute()
timedf_wk = df[week_cols].groupby('year_month_week').count().compute()

In [58]:
# Daily/weekly counts restricted to short-term customers ("guests").
# Filter rows first, then select columns — same result as the original
# column-select-then-boolean-index chain, but clearer and avoids the
# chained-indexing anti-pattern.
customers = df[df.user_type == 'Customer']
timedf2 = customers[['trip_duration', 'year_month_day']].groupby('year_month_day').count().compute()
timedf2_wk = customers[['trip_duration', 'year_month_week']].groupby('year_month_week').count().compute()

In [60]:
# Daily/weekly counts restricted to annual subscribers, filtered the same
# way as the Customer cell above (rows first, then columns).
subscribers = df[df.user_type == 'Subscriber']
timedf3 = subscribers[['trip_duration', 'year_month_day']].groupby('year_month_day').count().compute()
timedf3_wk = subscribers[['trip_duration', 'year_month_week']].groupby('year_month_week').count().compute()

In [7]:
import matplotlib.dates as mdates

In [100]:
# Plot daily trip counts (faint) and the 7-day average (solid) for all
# riders plus the Customer/Subscriber breakdowns. The three copy-pasted
# plot pairs of the original are folded into one loop; the unused `months`
# locator and the misnamed `yearsFmt` (it formats months) are gone.
series = [
    (timedf, timedf_wk, 'C0', ''),
    (timedf2, timedf2_wk, 'C1', 'Guests '),
    (timedf3, timedf3_wk, 'C2', 'Subscriber '),
]
for daily, weekly, color, prefix in series:
    plt.plot(daily.index.values, daily.trip_duration, alpha=0.4, color=color,
             label=prefix + 'Trips per day')
    # Weekly totals divided by 7 give the average trips per day that week.
    plt.plot(weekly.index.values, weekly.trip_duration/7., color=color,
             alpha=1, label=prefix + 'One week average of trips per day')

plt.xlabel('Date')
plt.ylabel("Trips Per Day")
plt.gcf().set_size_inches(10, 5)

ax = plt.gca()
# Major ticks every quarter ("Jul 2013" style), minor ticks every month.
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(10000))
ax.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(2500))

plt.legend(loc='upper left')

plt.xlim('2013-07-01', '2017-01-01')
plt.ylim(0, 75000)

plt.gcf().autofmt_xdate()



In [44]:
# Daily Central Park weather observations — presumably a GHCN-Daily export,
# given the PRCP/SNWD/SNOW/TMAX/TMIN/AWND columns used below (TODO confirm).
weather_df = pd.read_csv('../central_park_weather.csv.gz')

In [53]:
# Parse the YYYYMMDD-encoded DATE column into proper Timestamps so it can
# join against the trip-count date index.
weather_df['DATE'] = pd.to_datetime(weather_df.DATE, format='%Y%m%d')

In [64]:
# Attach the weather covariates to the daily trip counts; a left join keeps
# every count day even if its weather row is missing.
weather_cols = ['DATE', 'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN', 'AWND']
merged_counts = timedf.merge(weather_df[weather_cols], left_index=True,
                             right_on='DATE', how='left')

In [73]:
# Rename only the count column to 'N' for the regression formula below.
# The original assigned all 8 column names positionally, which silently
# mislabels data if the merge's column order ever changes.
merged_counts = merged_counts.rename(columns={'trip_duration': 'N'})

In [81]:
import statsmodels.api
import statsmodels.formula.api
from patsy import standardize as st

In [91]:
# OLS of daily trip count on standardized weather covariates; `st` is
# patsy.standardize, so each coefficient is the effect of a one-SD change.
weather_formula = "N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)"
model = statsmodels.formula.api.ols(formula=weather_formula, data=merged_counts)
model.fit().summary()


Out[91]:
OLS Regression Results
Dep. Variable: N R-squared: 0.549
Model: OLS Adj. R-squared: 0.547
Method: Least Squares F-statistic: 308.9
Date: Fri, 19 May 2017 Prob (F-statistic): 1.81e-216
Time: 15:09:19 Log-Likelihood: -13493.
No. Observations: 1276 AIC: 2.700e+04
Df Residuals: 1270 BIC: 2.703e+04
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.892e+04 265.571 108.898 0.000 2.84e+04 2.94e+04
st(PRCP) -3210.2392 273.517 -11.737 0.000 -3746.835 -2673.644
st(SNWD) -2102.8777 299.407 -7.023 0.000 -2690.265 -1515.490
st(SNOW) -182.5400 284.662 -0.641 0.521 -741.000 375.920
st(TMAX) 8357.3333 1012.308 8.256 0.000 6371.353 1.03e+04
st(TMIN) 422.5314 1020.685 0.414 0.679 -1579.882 2424.945
Omnibus: 126.581 Durbin-Watson: 0.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 168.763
Skew: 0.797 Prob(JB): 2.26e-37
Kurtosis: 3.798 Cond. No. 8.23

In [95]:
# NOTE(review): this cell is byte-identical to the In[91] cell above and its
# output is the same — a plain rerun. Consider deleting one of the two.
model = statsmodels.formula.api.ols(formula="N ~ st(PRCP) + st(SNWD) + st(SNOW) + st(TMAX) + st(TMIN)", data=merged_counts)
model.fit().summary()


Out[95]:
OLS Regression Results
Dep. Variable: N R-squared: 0.549
Model: OLS Adj. R-squared: 0.547
Method: Least Squares F-statistic: 308.9
Date: Fri, 19 May 2017 Prob (F-statistic): 1.81e-216
Time: 15:10:17 Log-Likelihood: -13493.
No. Observations: 1276 AIC: 2.700e+04
Df Residuals: 1270 BIC: 2.703e+04
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 2.892e+04 265.571 108.898 0.000 2.84e+04 2.94e+04
st(PRCP) -3210.2392 273.517 -11.737 0.000 -3746.835 -2673.644
st(SNWD) -2102.8777 299.407 -7.023 0.000 -2690.265 -1515.490
st(SNOW) -182.5400 284.662 -0.641 0.521 -741.000 375.920
st(TMAX) 8357.3333 1012.308 8.256 0.000 6371.353 1.03e+04
st(TMIN) 422.5314 1020.685 0.414 0.679 -1579.882 2424.945
Omnibus: 126.581 Durbin-Watson: 0.431
Prob(Omnibus): 0.000 Jarque-Bera (JB): 168.763
Skew: 0.797 Prob(JB): 2.26e-37
Kurtosis: 3.798 Cond. No. 8.23

In [68]:
# Inspect the OLS API (fit options, regularized fit) — interactive aid only;
# could be removed from the finished notebook.
help(statsmodels.api.OLS)


Help on class OLS in module statsmodels.regression.linear_model:

class OLS(WLS)
 |  A simple ordinary least squares model.
 |  
 |  
 |  Parameters
 |  ----------
 |  endog : array-like
 |      1-d endogenous response variable. The dependent variable.
 |  exog : array-like
 |      A nobs x k array where `nobs` is the number of observations and `k`
 |      is the number of regressors. An intercept is not included by default
 |      and should be added by the user. See
 |      :func:`statsmodels.tools.add_constant`.
 |  missing : str
 |      Available options are 'none', 'drop', and 'raise'. If 'none', no nan
 |      checking is done. If 'drop', any observations with nans are dropped.
 |      If 'raise', an error is raised. Default is 'none.'
 |  hasconst : None or bool
 |      Indicates whether the RHS includes a user-supplied constant. If True,
 |      a constant is not checked for and k_constant is set to 1 and all
 |      result statistics are calculated as if a constant is present. If
 |      False, a constant is not checked for and k_constant is set to 0.
 |  
 |  
 |  Attributes
 |  ----------
 |  weights : scalar
 |      Has an attribute weights = array(1.0) due to inheritance from WLS.
 |  
 |  See Also
 |  --------
 |  GLS
 |  
 |  Examples
 |  --------
 |  >>> import numpy as np
 |  >>>
 |  >>> import statsmodels.api as sm
 |  >>>
 |  >>> Y = [1,3,4,5,2,3,4]
 |  >>> X = range(1,8)
 |  >>> X = sm.add_constant(X)
 |  >>>
 |  >>> model = sm.OLS(Y,X)
 |  >>> results = model.fit()
 |  >>> results.params
 |  array([ 2.14285714,  0.25      ])
 |  >>> results.tvalues
 |  array([ 1.87867287,  0.98019606])
 |  >>> print(results.t_test([1, 0])))
 |  <T test: effect=array([ 2.14285714]), sd=array([[ 1.14062282]]), t=array([[ 1.87867287]]), p=array([[ 0.05953974]]), df_denom=5>
 |  >>> print(results.f_test(np.identity(2)))
 |  <F test: F=array([[ 19.46078431]]), p=[[ 0.00437251]], df_denom=5, df_num=2>
 |  
 |  Notes
 |  -----
 |  No constant is added by the model unless you are using formulas.
 |  
 |  Method resolution order:
 |      OLS
 |      WLS
 |      RegressionModel
 |      statsmodels.base.model.LikelihoodModel
 |      statsmodels.base.model.Model
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, endog, exog=None, missing='none', hasconst=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  loglike(self, params)
 |      The likelihood function for the clasical OLS model.
 |      
 |      Parameters
 |      ----------
 |      params : array-like
 |          The coefficients with which to estimate the log-likelihood.
 |      
 |      Returns
 |      -------
 |      The concentrated likelihood function evaluated at params.
 |  
 |  whiten(self, Y)
 |      OLS model whitener does nothing: returns Y.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from RegressionModel:
 |  
 |  fit(self, method='pinv', cov_type='nonrobust', cov_kwds=None, use_t=None, **kwargs)
 |      Full fit of the model.
 |      
 |      The results include an estimate of covariance matrix, (whitened)
 |      residuals and an estimate of scale.
 |      
 |      Parameters
 |      ----------
 |      method : str
 |          Can be "pinv", "qr".  "pinv" uses the Moore-Penrose pseudoinverse
 |          to solve the least squares problem. "qr" uses the QR
 |          factorization.
 |      
 |      Returns
 |      -------
 |      A RegressionResults class instance.
 |      
 |      See Also
 |      ---------
 |      regression.RegressionResults
 |      
 |      Notes
 |      -----
 |      The fit method uses the pseudoinverse of the design/exogenous variables
 |      to solve the least squares minimization.
 |  
 |  fit_regularized(self, method='coord_descent', maxiter=1000, alpha=0.0, L1_wt=1.0, start_params=None, cnvrg_tol=1e-08, zero_tol=1e-08, **kwargs)
 |      Return a regularized fit to a linear regression model.
 |      
 |      Parameters
 |      ----------
 |      method : string
 |          Only the coordinate descent algorithm is implemented.
 |      maxiter : integer
 |          The maximum number of iteration cycles (an iteration cycle
 |          involves running coordinate descent on all variables).
 |      alpha : scalar or array-like
 |          The penalty weight.  If a scalar, the same penalty weight
 |          applies to all variables in the model.  If a vector, it
 |          must have the same length as `params`, and contains a
 |          penalty weight for each coefficient.
 |      L1_wt : scalar
 |          The fraction of the penalty given to the L1 penalty term.
 |          Must be between 0 and 1 (inclusive).  If 0, the fit is
 |          ridge regression.  If 1, the fit is the lasso.
 |      start_params : array-like
 |          Starting values for ``params``.
 |      cnvrg_tol : scalar
 |          If ``params`` changes by less than this amount (in sup-norm)
 |          in once iteration cycle, the algorithm terminates with
 |          convergence.
 |      zero_tol : scalar
 |          Any estimated coefficient smaller than this value is
 |          replaced with zero.
 |      
 |      Returns
 |      -------
 |      A RegressionResults object, of the same type returned by
 |      ``fit``.
 |      
 |      Notes
 |      -----
 |      The approach closely follows that implemented in the glmnet
 |      package in R.  The penalty is the "elastic net" penalty, which
 |      is a convex combination of L1 and L2 penalties.
 |      
 |      The function that is minimized is: ..math::
 |      
 |          0.5*RSS/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)
 |      
 |      where RSS is the usual regression sum of squares, n is the
 |      sample size, and :math:`|*|_1` and :math:`|*|_2` are the L1 and L2
 |      norms.
 |      
 |      Post-estimation results are based on the same data used to
 |      select variables, hence may be subject to overfitting biases.
 |      
 |      References
 |      ----------
 |      Friedman, Hastie, Tibshirani (2008).  Regularization paths for
 |      generalized linear models via coordinate descent.  Journal of
 |      Statistical Software 33(1), 1-22 Feb 2010.
 |  
 |  initialize(self)
 |      Initialize (possibly re-initialize) a Model instance. For
 |      instance, the design matrix of a linear model may change
 |      and some things must be recomputed.
 |  
 |  predict(self, params, exog=None)
 |      Return linear predicted values from a design matrix.
 |      
 |      Parameters
 |      ----------
 |      params : array-like
 |          Parameters of a linear model
 |      exog : array-like, optional.
 |          Design / exogenous data. Model exog is used if None.
 |      
 |      Returns
 |      -------
 |      An array of fitted values
 |      
 |      Notes
 |      -----
 |      If the model has not yet been fit, params is not optional.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from RegressionModel:
 |  
 |  df_model
 |      The model degree of freedom, defined as the rank of the regressor
 |      matrix minus 1 if a constant is included.
 |  
 |  df_resid
 |      The residual degree of freedom, defined as the number of observations
 |      minus the rank of the regressor matrix.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from statsmodels.base.model.LikelihoodModel:
 |  
 |  hessian(self, params)
 |      The Hessian matrix of the model
 |  
 |  information(self, params)
 |      Fisher information matrix of model
 |      
 |      Returns -Hessian of loglike evaluated at params.
 |  
 |  score(self, params)
 |      Score vector of model.
 |      
 |      The gradient of logL with respect to each parameter.
 |  
 |  ----------------------------------------------------------------------
 |  Class methods inherited from statsmodels.base.model.Model:
 |  
 |  from_formula(formula, data, subset=None, *args, **kwargs) from builtins.type
 |      Create a Model from a formula and dataframe.
 |      
 |      Parameters
 |      ----------
 |      formula : str or generic Formula object
 |          The formula specifying the model
 |      data : array-like
 |          The data for the model. See Notes.
 |      subset : array-like
 |          An array-like object of booleans, integers, or index values that
 |          indicate the subset of df to use in the model. Assumes df is a
 |          `pandas.DataFrame`
 |      args : extra arguments
 |          These are passed to the model
 |      kwargs : extra keyword arguments
 |          These are passed to the model with one exception. The
 |          ``eval_env`` keyword is passed to patsy. It can be either a
 |          :class:`patsy:patsy.EvalEnvironment` object or an integer
 |          indicating the depth of the namespace to use. For example, the
 |          default ``eval_env=0`` uses the calling namespace. If you wish
 |          to use a "clean" environment set ``eval_env=-1``.
 |      
 |      
 |      Returns
 |      -------
 |      model : Model instance
 |      
 |      Notes
 |      ------
 |      data must define __getitem__ with the keys in the formula terms
 |      args and kwargs are passed on to the model instantiation. E.g.,
 |      a numpy structured or rec array, a dictionary, or a pandas DataFrame.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from statsmodels.base.model.Model:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  endog_names
 |  
 |  exog_names


In [49]:
# Rank start stations by number of business-day trips.
# NOTE(review): `df2` is not defined in any visible cell — this relies on
# hidden kernel state and will fail on Restart & Run All. Presumably it is
# some filtered copy of `df`; TODO: restore the defining cell.
df2[['is_bus_day', 'start_station_id']][df2.is_bus_day == True].groupby('start_station_id') \
    .count().compute().sort_values('is_bus_day', ascending=False)


Out[49]:
is_bus_day
start_station_id
519 356749
497 235838
521 233325
435 228568
293 214210
402 208014
490 206323
477 199371
426 196368
285 196145
151 178778
444 177460
284 176299
379 174838
523 173904
459 170351
382 166108
327 164709
168 161661
368 160471
358 153488
359 152552
472 152160
127 149701
492 148322
417 146737
499 146562
517 145338
537 144539
446 140362
... ...
3302 825
3332 754
3401 713
3424 698
3348 667
3229 658
3338 645
3340 561
3333 509
3440 478
3395 453
3330 420
3393 393
3394 325
3342 303
3326 284
3130 201
3245 177
3219 170
3014 100
3371 40
3250 25
3432 24
3239 18
3017 10
3036 10
3040 8
3266 7
3240 3
3385 1

659 rows × 1 columns


In [50]:
# Per-station trip counts over all days (not just business days).
# NOTE(review): `df2` again comes from hidden kernel state (undefined here).
# `.count()` counts non-null values per column, so the three count columns
# need not be identical when a column has missing values.
zz = df2[['start_station_id', 'is_bus_day', 'start_time', 'start_station_name']].groupby('start_station_id') \
 .count().compute().sort_values('is_bus_day', ascending=False)
zz


Out[50]:
is_bus_day start_time start_station_name
start_station_id
519 397813 397813 397813
497 315510 315510 315510
435 299698 299698 299698
426 280868 280868 280868
293 279981 279981 279981
521 268807 268807 268807
285 266162 266162 266162
402 260880 260880 260880
151 247791 247791 247791
490 245801 245801 245801
284 236954 236954 236954
477 229820 229820 229820
444 229799 229799 229799
459 227690 227690 227690
368 227610 227610 227610
382 219669 219669 219669
327 215974 215974 215974
499 215419 215419 215419
358 213026 213026 213026
168 204413 204413 204413
379 200189 200189 200189
523 196410 196410 196410
387 191457 191457 191457
2006 189364 189364 189364
127 187794 187794 187794
514 186694 186694 186694
3002 185525 185525 185525
446 181832 181832 181832
472 180320 180320 180320
504 179485 179485 179485
... ... ... ...
3424 929 929 929
3229 916 916 916
3338 900 900 900
255 897 897 897
3340 794 794 794
3333 669 669 669
3330 610 610 610
3440 588 588 588
3395 567 567 567
3393 562 562 562
3394 450 450 450
3326 429 429 429
3342 410 410 410
3130 351 351 351
3014 324 324 324
3219 180 180 180
3257 178 178 178
3245 177 177 177
3253 62 62 62
3371 53 53 53
3250 25 25 25
3432 24 24 24
3252 20 20 20
3239 18 18 18
3017 12 12 12
3040 11 11 11
3036 10 10 10
3266 7 7 7
3240 3 3 3
3385 1 1 1

662 rows × 3 columns


In [51]:
# Station ids with more than 500 recorded trips — the keep-list used below.
zz[zz.is_bus_day>500].index


Out[51]:
Int64Index([ 519,  497,  435,  426,  293,  521,  285,  402,  151,  490,
            ...
            3424, 3229, 3338,  255, 3340, 3333, 3330, 3440, 3395, 3393],
           dtype='int64', name='start_station_id', length=642)

In [53]:
# Keep only trips that start at the >500-trip stations identified above.
# NOTE(review): depends on the undefined `df2` (hidden kernel state).
df3 = df2[df2.start_station_id.isin(zz[zz.is_bus_day>500].index)]

In [56]:
# Non-null counts per column after the station filter — compare with the
# unfiltered `df.count()` cell below (only ~2.7k rows dropped).
df3.count().compute()


Out[56]:
start_time                 36899280
trip_duration              36899280
stop_time                  36899280
start_station_id           36899280
start_station_name         36899280
start_station_latitude     36899280
start_station_longitude    36899280
end_station_id             36899280
end_station_name           36899280
end_station_latitude       36899280
end_station_longitude      36899280
bike_id                    36899280
user_type                  36863421
birth_year                 32541201
gender                     36899280
start_taxizone_id          36899266
end_taxizone_id            36898966
is_bus_day                 36899280
dtype: int64

In [58]:
# Baseline non-null counts for the full, unfiltered trips frame.
df.count().compute()


Out[58]:
trip_duration              36902025
stop_time                  36902025
start_station_id           36902025
start_station_name         36902025
start_station_latitude     36902025
start_station_longitude    36902025
end_station_id             36902025
end_station_name           36902025
end_station_latitude       36902025
end_station_longitude      36902025
bike_id                    36902025
user_type                  36866154
birth_year                 32543206
gender                     36902025
start_taxizone_id          36902006
end_taxizone_id            36901708
dtype: int64

In [63]:
import json

In [70]:
# Station metadata snapshot from the Citi Bike station feed JSON.
# Use a context manager so the file handle is closed deterministically —
# the original `json.load(open(...))` leaked the handle.
with open("../15_dataframe_analysis/stations.2017.04.20.09.43.json") as f:
    df_stations = pd.DataFrame(json.load(f)['stationBeanList'])

In [ ]: