In [510]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Location of the turnstile/weather CSV. The original absolute path stays as
# the default so existing runs are unaffected; set the TURNSTILE_DATA_CSV
# environment variable to load the file from a different location.
filename = os.environ.get(
    'TURNSTILE_DATA_CSV',
    '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
)

# import data
data = pd.read_csv(filename)

INITIAL DATA EXPLORATION


In [511]:
# Report the frame's shape tuple, then preview its first three rows.
print("SHAPE: {}".format(data.shape))
data.head(3)


SHAPE: (42649, 27)
Out[511]:
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly datetime hour day_week ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 R003 05-01-11 00:00:00 4388333 2911002 0 0 2011-05-01 00:00:00 0 6 ... 30.22 0 55.9 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
1 R003 05-01-11 04:00:00 4388333 2911002 0 0 2011-05-01 04:00:00 4 6 ... 30.25 0 52.0 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
2 R003 05-01-11 12:00:00 4388333 2911002 0 0 2011-05-01 12:00:00 12 6 ... 30.28 0 62.1 6.9 0 30.258 55.98 7.86 40.700348 -73.887177

3 rows × 27 columns


In [512]:
# Heading for the per-column dtype listing displayed as this cell's output.
print("COLUMNAR DATA TYPES")
data.dtypes


COLUMNAR DATA TYPES
Out[512]:
UNIT                object
DATEn               object
TIMEn               object
ENTRIESn             int64
EXITSn               int64
ENTRIESn_hourly    float64
EXITSn_hourly      float64
datetime            object
hour                 int64
day_week             int64
weekday              int64
station             object
latitude           float64
longitude          float64
conds               object
fog                  int64
precipi            float64
pressurei          float64
rain                 int64
tempi              float64
wspdi              float64
meanprecipi        float64
meanpressurei      float64
meantempi          float64
meanwspdi          float64
weather_lat        float64
weather_lon        float64
dtype: object

In [513]:
# Summary statistics (count, mean, std, quartiles, extremes) of the ENTRIESn column.
entries_column = data['ENTRIESn']
entries_column.describe()


Out[513]:
count    4.264900e+04
mean     2.812486e+07
std      3.043607e+07
min      0.000000e+00
25%      1.039762e+07
50%      1.818389e+07
75%      3.263049e+07
max      2.357746e+08
Name: ENTRIESn, dtype: float64

In [514]:
# Summary statistics (count, mean, std, quartiles, extremes) of the ENTRIESn_hourly column.
entries_hourly_column = data['ENTRIESn_hourly']
entries_hourly_column.describe()


Out[514]:
count    42649.000000
mean      1886.589955
std       2952.385585
min          0.000000
25%        274.000000
50%        905.000000
75%       2255.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

In [538]:
### columnar data to explore
#units = data['UNIT'].unique()
#dates = data['DATEn'].unique()
#hours = data['hour'].unique()
#days_of_week = data['day_week'].unique()
#stations = data['station'].unique()
#latitudes = data['latitude'].unique()
#longitudes = data['longitude'].unique()

UNIT STATISTICS


In [516]:
# Row-aligned arrays: the UNIT value of every row and that row's ENTRIESn_hourly.
unit_instances = data['UNIT'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct UNIT
unit_entries_hourly = {}
for unit_id, hourly_entries in zip(unit_instances, entries_hourly_by_row):
    unit_entries_hourly[unit_id] = unit_entries_hourly.get(unit_id, 0.0) + float(hourly_entries)

#print unit_entries_hourly
units = unit_entries_hourly.keys()
entries = unit_entries_hourly.values()

In [517]:
# One row per unit: the unit key and its summed ENTRIESn_hourly total.
unit_df = pd.DataFrame(data=list(unit_entries_hourly.items()), columns=['unit','entries'])

print("UNITS AND THEIR ENTRIES")
print(unit_df.head(3))

print("")
print(unit_df.describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(unit_df['entries'].values)))


UNITS AND THEIR ENTRIES
   unit  entries
0  R318   112098
1  R319   254531
2  R312    73913

              entries
count      240.000000
mean    335254.895833
std     334849.388932
min          0.000000
25%     131148.000000
50%     221479.500000
75%     409285.750000
max    1868674.000000
range  1868674.000000

In [518]:
# Line plot of the per-unit totals against the frame's row index.
unit_axes = unit_df.plot(title='UNITS AND THEIR ENTRIES')
unit_axes.set_xlabel('unit row index')
unit_axes.set_ylabel('unit entries')
plt.show()


DATE STATISTICS


In [519]:
# Row-aligned arrays: the DATEn value of every row and that row's ENTRIESn_hourly.
date_instances = data['DATEn'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct DATEn
date_entries_hourly = {}
for date_label, hourly_entries in zip(date_instances, entries_hourly_by_row):
    date_entries_hourly[date_label] = date_entries_hourly.get(date_label, 0.0) + float(hourly_entries)

dates = date_entries_hourly.keys()
entries = date_entries_hourly.values()

In [520]:
# One row per date: the date key and its summed ENTRIESn_hourly total.
date_df = pd.DataFrame(data=list(date_entries_hourly.items()), columns=['date','entries'])

print("DATES AND THEIR ENTRIES")
print(date_df.head(3))

print("")
print(date_df.describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(date_df['entries'].values)))


DATES AND THEIR ENTRIES
       date  entries
0  05-30-11  1409572
1  05-15-11  1413929
2  05-04-11  3118915

              entries
count       31.000000
mean   2595521.774194
std     710440.834289
min    1400098.000000
25%    1891834.000000
50%    3009536.000000
75%    3137683.000000
max    3201840.000000
range  1801742.000000

In [521]:
# Line plot of the per-date totals against the frame's row index.
date_axes = date_df.plot(title='DATES AND THEIR ENTRIES')
date_axes.set_xlabel('date row index')
date_axes.set_ylabel('date entries')
plt.show()


HOUR STATISTICS


In [522]:
# Row-aligned arrays: the hour value of every row and that row's ENTRIESn_hourly.
hour_instances = data['hour'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct hour
hour_entries_hourly = {}
for hour_of_day, hourly_entries in zip(hour_instances, entries_hourly_by_row):
    hour_entries_hourly[hour_of_day] = hour_entries_hourly.get(hour_of_day, 0.0) + float(hourly_entries)

hours = hour_entries_hourly.keys()
entries = hour_entries_hourly.values()

In [523]:
# One row per hour: the hour key and its summed ENTRIESn_hourly total.
hour_df = pd.DataFrame(data=list(hour_entries_hourly.items()), columns=['hour','entries'])

print("HOURS AND THEIR ENTRIES")
print(hour_df.head(3))

print("")
# Describe only the totals; the hour keys themselves are not summarised.
print(hour_df[['entries']].describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(hour_df['entries'].values)))


HOURS AND THEIR ENTRIES
   hour   entries
0     0  10353167
1     4   2300788
2     8   5198583

               entries
count         6.000000
mean   13410195.833333
std     8863957.086415
min     2300788.000000
25%     6487229.000000
50%    13593103.500000
75%    20772247.000000
max    23690281.000000
range  21389493.000000

In [524]:
# Line plot of total entries per hour.
hour_axes = hour_df.plot(x='hour', y='entries', title='HOURS AND THEIR ENTRIES')
hour_axes.set_xlabel('hour')
hour_axes.set_ylabel('hour entries')
hour_axes.legend(['entries'])
# Clamp the y-axis to the plotted totals. The limits are read from hour_df
# itself rather than the notebook-wide `entries` list, which every section
# reassigns and which may belong to another section when cells run out of order.
hour_axes.set_ylim(hour_df['entries'].min(), hour_df['entries'].max())
plt.show()


DAYS OF WEEK STATISTICS


In [560]:
# Row-aligned arrays: the day_week value of every row and that row's ENTRIESn_hourly.
days_of_week_instances = data['day_week'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct day_week
days_of_week_entries_hourly = {}
for day_index, hourly_entries in zip(days_of_week_instances, entries_hourly_by_row):
    days_of_week_entries_hourly[day_index] = days_of_week_entries_hourly.get(day_index, 0.0) + float(hourly_entries)

days_of_week = days_of_week_entries_hourly.keys()
entries = days_of_week_entries_hourly.values()

In [561]:
# One row per day_week value: the key and its summed ENTRIESn_hourly total.
days_of_week_df = pd.DataFrame(data=list(days_of_week_entries_hourly.items()), columns=['day_week','entries'])

print("DAYS OF WEEK AND THEIR ENTRIES")
print(days_of_week_df.head(3))

print("")
# Describe only the totals; the day_week keys themselves are not summarised.
print(days_of_week_df[['entries']].describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<8}{:.6f}".format('range', np.ptp(days_of_week_df['entries'].values)))


DAYS OF WEEK AND THEIR ENTRIES
   day_week   entries
0         0  12795107
1         1  15246943
2         2  12592691

               entries
count         7.000000
mean   11494453.571429
std     2989933.638739
min     7218706.000000
25%     9949293.000000
50%    12592691.000000
75%    12752124.500000
max    15246943.000000
range   8028237.000000

In [563]:
# Line plot of total entries per day_week value.
days_of_week_axes = days_of_week_df.plot(x='day_week', y='entries', title='DAYS OF WEEK AND THEIR ENTRIES')
days_of_week_axes.set_xlabel('day of week')
days_of_week_axes.set_ylabel('day of week entries')
days_of_week_axes.legend(['entries'])
# Clamp the y-axis to the plotted totals. The limits are read from
# days_of_week_df itself rather than the notebook-wide `entries` list, which
# every section reassigns and which may belong to another section when cells
# run out of order.
days_of_week_axes.set_ylim(days_of_week_df['entries'].min(), days_of_week_df['entries'].max())
plt.show()


STATION STATISTICS


In [527]:
# Row-aligned arrays: the station value of every row and that row's ENTRIESn_hourly.
station_instances = data['station'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct station
station_entries_hourly = {}
for station_name, hourly_entries in zip(station_instances, entries_hourly_by_row):
    station_entries_hourly[station_name] = station_entries_hourly.get(station_name, 0.0) + float(hourly_entries)

stations = station_entries_hourly.keys()
entries = station_entries_hourly.values()

In [528]:
# One row per station: the station key and its summed ENTRIESn_hourly total.
station_df = pd.DataFrame(data=list(station_entries_hourly.items()), columns=['station','entries'])

print("STATIONS AND THEIR ENTRIES")
print(station_df.head(3))

print("")
print(station_df.describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(station_df['entries'].values)))


STATIONS AND THEIR ENTRIES
           station  entries
0  LEXINGTON-53 ST   930423
1           207 ST   160382
2      BEACH 67 ST    82119

              entries
count      207.000000
mean    388701.328502
std     457501.301121
min          0.000000
25%     140102.000000
50%     225183.000000
75%     473735.000000
max    2920887.000000
range  2920887.000000

In [558]:
# Line plot of the per-station totals against the frame's row index.
station_axes = station_df.plot(title='STATIONS AND THEIR ENTRIES')
station_axes.set_xlabel('station row index')
station_axes.set_ylabel('station entries')
plt.show()


LATITUDE STATISTICS


In [552]:
# Row-aligned arrays: the latitude value of every row and that row's ENTRIESn_hourly.
latitude_instances = data['latitude'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct latitude
latitude_entries_hourly = {}
for lat, hourly_entries in zip(latitude_instances, entries_hourly_by_row):
    latitude_entries_hourly[lat] = latitude_entries_hourly.get(lat, 0.0) + float(hourly_entries)

latitudes = latitude_entries_hourly.keys()
entries = latitude_entries_hourly.values()

In [553]:
# One row per distinct latitude: the latitude key and its summed ENTRIESn_hourly total.
latitude_df = pd.DataFrame(data=list(latitude_entries_hourly.items()), columns=['latitude','entries'])

print("LATITUDES AND THEIR ENTRIES")
print(latitude_df.head(3))

print("")
# Summarise the keys and the totals as two separate single-column tables.
print(latitude_df[['latitude']].describe())
print(latitude_df[['entries']].describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(latitude_df['entries'].values)))


LATITUDES AND THEIR ENTRIES
    latitude  entries
0  40.852417     7559
1  40.707840   209745
2  40.643982   102508

         latitude
count  233.000000
mean    40.724324
std      0.073508
min     40.576152
25%     40.675382
50%     40.714590
75%     40.759578
max     40.889185
              entries
count      233.000000
mean    345326.931330
std     393653.267874
min          0.000000
25%     131511.000000
50%     218938.000000
75%     402883.000000
max    2920887.000000
range  2920887.000000

In [556]:
# Scatter of total entries against latitude.
latitude_axes = latitude_df.plot(kind='scatter', x='latitude', y='entries',
                                 title='LATITUDES AND THEIR ENTRIES')
latitude_axes.set_xlabel('latitude')
latitude_axes.set_ylabel('latitude entries')
latitude_axes.legend(['entries'])
plt.show()



In [ ]:

LONGITUDE STATISTICS


In [564]:
# Row-aligned arrays: the longitude value of every row and that row's ENTRIESn_hourly.
longitude_instances = data['longitude'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: keep one running float total of ENTRIESn_hourly per distinct longitude
longitude_entries_hourly = {}
for lon, hourly_entries in zip(longitude_instances, entries_hourly_by_row):
    longitude_entries_hourly[lon] = longitude_entries_hourly.get(lon, 0.0) + float(hourly_entries)

longitudes = longitude_entries_hourly.keys()
entries = longitude_entries_hourly.values()

In [ ]:
def get_column_values_and_reduce(column1, column2):
    """Total ``column2`` once per distinct value of ``column1``.

    Performs the same "reduce" step that the sections of this notebook write
    out by hand: walk two row-aligned columns and keep a running float total
    of the second for every distinct value of the first.

    Parameters
    ----------
    column1 : str or sequence
        Grouping keys. A string is looked up as a column name of the
        notebook-level ``data`` frame; anything else is used directly as the
        per-row key values.
    column2 : str or sequence
        Amounts to total, row-aligned with ``column1``. Resolved the same way.

    Returns
    -------
    keys : list
        The distinct key values.
    values : list of float
        The total for each key; ``values[i]`` belongs to ``keys[i]``.

    Raises
    ------
    ValueError
        If the two columns do not have the same number of rows.
    """
    text_types = (str, type(u''))  # also covers Python 2 unicode column names
    instances = data[column1].values if isinstance(column1, text_types) else column1
    amounts = data[column2].values if isinstance(column2, text_types) else column2

    # zip() would silently drop the tail of the longer column, so fail loudly.
    if len(instances) != len(amounts):
        raise ValueError("columns must have the same length: %d != %d"
                         % (len(instances), len(amounts)))

    totals = {}
    for key, amount in zip(instances, amounts):
        totals[key] = totals.get(key, 0.0) + float(amount)

    # Both lists are read from the same unmodified dict, so they stay index-aligned.
    keys = list(totals.keys())
    values = list(totals.values())
    return keys, values

In [565]:
# One row per distinct longitude: the longitude key and its summed ENTRIESn_hourly total.
longitude_df = pd.DataFrame(data=list(longitude_entries_hourly.items()), columns=['longitude','entries'])

print("LONGITUDES AND THEIR ENTRIES")
print(longitude_df.head(3))

print("")
# Summarise the keys and the totals as two separate single-column tables.
print(longitude_df[['longitude']].describe())
print(longitude_df[['entries']].describe())
# Range (max - min) is computed from this frame's own column rather than the
# notebook-wide `entries` list, which every section reassigns and which may be
# stale when cells are run out of order. It is formatted as a real six-decimal
# number (as in the describe() rows) instead of zero-padding its string form.
print("{:<7}{:.6f}".format('range', np.ptp(longitude_df['entries'].values)))


LONGITUDES AND THEIR ENTRIES
   longitude  entries
0 -73.977417   911174
1 -73.828125   193792
2 -74.014099   694605

        longitude
count  234.000000
mean   -73.938272
std      0.060382
min    -74.073622
25%    -73.986375
50%    -73.950959
75%    -73.904319
max    -73.755383
              entries
count      234.000000
mean    343851.175214
std     393424.158576
min          0.000000
25%     130422.000000
50%     217648.000000
75%     402551.250000
max    2920887.000000
range  2920887.000000

In [566]:
# Scatter of total entries against longitude.
longitude_axes = longitude_df.plot(kind='scatter', x='longitude', y='entries',
                                   title='LONGITUDES AND THEIR ENTRIES')
longitude_axes.set_xlabel('longitude')
longitude_axes.set_ylabel('longitude entries')
longitude_axes.legend(['entries'])
plt.show()



In [530]:
# TODO: plot histograms of longitude and latitude

In [530]:


In [530]:


In [530]:


In [530]:


In [530]:


In [530]:

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?