In [195]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# import data
data = pd.read_csv(filename)

INITIAL DATA EXPLORATION


In [196]:
print "SHAPE: " + str(data.shape)
data.head(3)


SHAPE: (42649, 27)
Out[196]:
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly datetime hour day_week ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 R003 05-01-11 00:00:00 4388333 2911002 0 0 2011-05-01 00:00:00 0 6 ... 30.22 0 55.9 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
1 R003 05-01-11 04:00:00 4388333 2911002 0 0 2011-05-01 04:00:00 4 6 ... 30.25 0 52.0 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
2 R003 05-01-11 12:00:00 4388333 2911002 0 0 2011-05-01 12:00:00 12 6 ... 30.28 0 62.1 6.9 0 30.258 55.98 7.86 40.700348 -73.887177

3 rows × 27 columns


In [216]:
print "COLUMNAR DATA TYPES"
data.dtypes


COLUMNAR DATA TYPES
Out[216]:
UNIT                object
DATEn               object
TIMEn               object
ENTRIESn             int64
EXITSn               int64
ENTRIESn_hourly    float64
EXITSn_hourly      float64
datetime            object
hour                 int64
day_week             int64
weekday              int64
station             object
latitude           float64
longitude          float64
conds               object
fog                  int64
precipi            float64
pressurei          float64
rain                 int64
tempi              float64
wspdi              float64
meanprecipi        float64
meanpressurei      float64
meantempi          float64
meanwspdi          float64
weather_lat        float64
weather_lon        float64
dtype: object

In [245]:
data['ENTRIESn'].describe()


Out[245]:
count    4.264900e+04
mean     2.812486e+07
std      3.043607e+07
min      0.000000e+00
25%      1.039762e+07
50%      1.818389e+07
75%      3.263049e+07
max      2.357746e+08
Name: ENTRIESn, dtype: float64

In [246]:
data['ENTRIESn_hourly'].describe()


Out[246]:
count    42649.000000
mean      1886.589955
std       2952.385585
min          0.000000
25%        274.000000
50%        905.000000
75%       2255.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

In [244]:
units = data['UNIT'].unique()
dates = data['DATEn'].unique()
hours = data['hour'].unique()
days_of_week = data['day_week'].unique()
stations = data['station'].unique()
latitudes = data['latitude'].unique()
longitudes = data['longitude'].unique()

In [269]:
unit_instances = data['UNIT'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce
unit_entries_hourly = {}
for i in xrange(len(unit_instances)):
    if unit_instances[i] in unit_entries_hourly:
        unit_entries_hourly[unit_instances[i]] += float(entries_hourly_by_row[i])
    else:
        unit_entries_hourly[unit_instances[i]] = float(entries_hourly_by_row[i])

#print unit_entries_hourly
units = unit_entries_hourly.keys()
entries = unit_entries_hourly.values()


Out[269]:
[112098.0,
 254531.0,
 73913.0,
 36338.0,
 241903.0,
 85438.0,
 94036.0,
 177904.0,
 45543.0,
 184434.0,
 293381.0,
 153078.0,
 116383.0,
 511992.0,
 209522.0,
 155371.0,
 159268.0,
 60224.0,
 159814.0,
 98229.0,
 1270579.0,
 403636.0,
 331633.0,
 558241.0,
 442353.0,
 206614.0,
 80864.0,
 363785.0,
 359393.0,
 447802.0,
 344195.0,
 86561.0,
 151399.0,
 157716.0,
 82119.0,
 422663.0,
 82515.0,
 209745.0,
 80484.0,
 362755.0,
 177531.0,
 102508.0,
 36838.0,
 183444.0,
 335474.0,
 180810.0,
 97439.0,
 176509.0,
 0.0,
 19342.0,
 84786.0,
 980152.0,
 1555117.0,
 206430.0,
 873157.0,
 273436.0,
 115057.0,
 539955.0,
 236138.0,
 579548.0,
 156504.0,
 418894.0,
 518690.0,
 362244.0,
 327832.0,
 207313.0,
 281194.0,
 488454.0,
 178630.0,
 155671.0,
 629472.0,
 291034.0,
 4712.0,
 490741.0,
 130059.0,
 382147.0,
 208079.0,
 472827.0,
 250206.0,
 158729.0,
 80711.0,
 192339.0,
 957974.0,
 743466.0,
 591194.0,
 224021.0,
 1554806.0,
 274135.0,
 911174.0,
 272296.0,
 218938.0,
 122505.0,
 349720.0,
 131511.0,
 607617.0,
 320565.0,
 173012.0,
 244618.0,
 603738.0,
 170265.0,
 102294.0,
 126909.0,
 247963.0,
 176371.0,
 155245.0,
 116094.0,
 117983.0,
 30809.0,
 587490.0,
 52135.0,
 263759.0,
 225183.0,
 325646.0,
 459276.0,
 115569.0,
 190940.0,
 154040.0,
 308178.0,
 137363.0,
 175548.0,
 247005.0,
 380286.0,
 99526.0,
 598274.0,
 997807.0,
 52105.0,
 555000.0,
 1188515.0,
 867701.0,
 1773372.0,
 1147515.0,
 251360.0,
 693923.0,
 528599.0,
 106902.0,
 106942.0,
 1347727.0,
 628767.0,
 256989.0,
 53154.0,
 7559.0,
 245029.0,
 24694.0,
 501431.0,
 346577.0,
 29445.0,
 190500.0,
 96939.0,
 183210.0,
 191989.0,
 402883.0,
 371732.0,
 120687.0,
 256425.0,
 113474.0,
 401556.0,
 144377.0,
 257661.0,
 206879.0,
 138437.0,
 465945.0,
 203422.0,
 101453.0,
 206035.0,
 120579.0,
 151474.0,
 216358.0,
 151063.0,
 119378.0,
 406083.0,
 287175.0,
 479759.0,
 160382.0,
 144627.0,
 521054.0,
 211297.0,
 1534652.0,
 826329.0,
 812412.0,
 579583.0,
 295934.0,
 142841.0,
 45118.0,
 193792.0,
 262739.0,
 74223.0,
 154200.0,
 332024.0,
 209820.0,
 196469.0,
 268065.0,
 441844.0,
 304297.0,
 276722.0,
 278119.0,
 306828.0,
 101359.0,
 132477.0,
 901825.0,
 354853.0,
 282236.0,
 584479.0,
 674799.0,
 656181.0,
 486567.0,
 232348.0,
 1868674.0,
 485343.0,
 37472.0,
 230998.0,
 114148.0,
 64965.0,
 97003.0,
 94855.0,
 69452.0,
 58318.0,
 40341.0,
 99889.0,
 136383.0,
 186363.0,
 138680.0,
 116852.0,
 235822.0,
 242258.0,
 327951.0,
 211106.0,
 76079.0,
 292797.0,
 297412.0,
 607785.0,
 1444569.0,
 204547.0,
 293341.0,
 783761.0,
 146662.0,
 1355492.0,
 483372.0,
 1618262.0,
 174453.0,
 474643.0]

UNIT STATISTICS


In [212]:
#print data.groupby('UNIT')['UNIT'].count().head(3)
unit_counts = data.groupby('UNIT')['UNIT'].count().values

unit_df = pd.DataFrame(data['UNIT'].unique(), columns=['UNIT'])
unit_df['COUNT'] = unit_counts

print "UNITS AND THEIR FREQUENCIES"
print unit_df.head(3)
unit_df.describe()


UNITS AND THEIR FREQUENCIES
   UNIT  COUNT
0  R003    168
1  R004    175
2  R005    172
Out[212]:
COUNT
count 240.000000
mean 177.704167
std 9.842270
min 88.000000
25% 172.000000
50% 180.000000
75% 185.000000
max 186.000000

In [203]:
unit_df.plot(title='UNIT USAGE FREQUENCIES')
plt.xlabel('unit row index')
plt.ylabel('unit usage frequency')
plt.show()


DATE STATISTICS


In [213]:
#print data.groupby('DATEn')['DATEn'].count().head(3)
date_counts = data.groupby('DATEn')['DATEn'].count().values

date_df = pd.DataFrame(data['DATEn'].unique(), columns=['DATEn'])
date_df['COUNT'] = date_counts

print "DATES AND THEIR FREQUENCIES"
print date_df.head(3)
date_df.describe()


DATES AND THEIR FREQUENCIES
      DATEn  COUNT
0  05-01-11   1361
1  05-02-11   1388
2  05-03-11   1407
Out[213]:
COUNT
count 31.000000
mean 1375.774194
std 24.529859
min 1332.000000
25% 1357.500000
50% 1375.000000
75% 1399.500000
max 1417.000000

In [205]:
date_df.plot(title='DATE USAGE FREQUENCIES')
plt.xlabel('date row index')
plt.ylabel('date usage frequency')
plt.show()


STATION STATISTICS


In [219]:
#print data.groupby('station')['station'].count().head(3)
station_counts = data.groupby('station')['station'].count().values

station_df = pd.DataFrame(data['station'].unique(), columns=['STATION'])
station_df['COUNT'] = station_counts

print "STATIONS AND THEIR FREQUENCIES"
print station_df.head(3)
station_df.describe()


STATIONS AND THEIR FREQUENCIES
          STATION  COUNT
0   CYPRESS HILLS    185
1    ELDERTS LANE    182
2  FOREST PARKWAY    170
Out[219]:
COUNT
count 207.000000
mean 206.033816
std 78.946423
min 88.000000
25% 172.000000
50% 182.000000
75% 186.000000
max 558.000000

In [209]:
station_df.plot(title='STATION USAGE FREQUENCIES')
plt.xlabel('station row index')
plt.ylabel('station usage frequency')
plt.show()


STATION AND ENTRIES EXPLORATION


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?