In [1943]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# import data
data = pd.read_csv(filename)

INITIAL DATA EXPLORATION


In [1944]:
print "SHAPE: " + str(data.shape)
data.head(3)


SHAPE: (42649, 27)
Out[1944]:
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly datetime hour day_week ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 R003 05-01-11 00:00:00 4388333 2911002 0 0 2011-05-01 00:00:00 0 6 ... 30.22 0 55.9 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
1 R003 05-01-11 04:00:00 4388333 2911002 0 0 2011-05-01 04:00:00 4 6 ... 30.25 0 52.0 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
2 R003 05-01-11 12:00:00 4388333 2911002 0 0 2011-05-01 12:00:00 12 6 ... 30.28 0 62.1 6.9 0 30.258 55.98 7.86 40.700348 -73.887177

3 rows × 27 columns


In [1945]:
print "COLUMNAR DATA TYPES"
data.dtypes


COLUMNAR DATA TYPES
Out[1945]:
UNIT                object
DATEn               object
TIMEn               object
ENTRIESn             int64
EXITSn               int64
ENTRIESn_hourly    float64
EXITSn_hourly      float64
datetime            object
hour                 int64
day_week             int64
weekday              int64
station             object
latitude           float64
longitude          float64
conds               object
fog                  int64
precipi            float64
pressurei          float64
rain                 int64
tempi              float64
wspdi              float64
meanprecipi        float64
meanpressurei      float64
meantempi          float64
meanwspdi          float64
weather_lat        float64
weather_lon        float64
dtype: object

In [1946]:
data['ENTRIESn_hourly'].describe()


Out[1946]:
count    42649.000000
mean      1886.589955
std       2952.385585
min          0.000000
25%        274.000000
50%        905.000000
75%       2255.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

DATA AND FUNCTIONS FOR GATHERING INITIAL STATISTICS

[ N.B. Due to decisions described the Unit_Entries supplement, in the current analysis, unless otherwise noted, entries will refer to a summation of ENTRIESn_hourly per UNIT (i.e., not, as might be expected, values in the ENTRIESn column). ]


In [1947]:
entries_hourly_by_row = data['ENTRIESn_hourly'].values

In [1948]:
def map_column_to_entries_hourly(column):
    instances = column.values # e.g., longitude_instances = data['longitude'].values
    
    # reduce
    entries_hourly = {} # e.g., longitude_entries_hourly = {}
    for i in np.arange(len(instances)): 
        if instances[i] in entries_hourly:
            entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
        else:
            entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
            
    return entries_hourly # e.g., longitudes, entries

In [1949]:
def display_basic_stats(entries_hourly_dict, column1name):
    # e.g, longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
    df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
    
    p = inflect.engine()
    print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
    print df.head(3)
    
    print 
    print pd.DataFrame(df['entries']).describe()
    print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
    
    return df # e.g, longitude_df

In [1950]:
def plot_data(df, column1name, plot_kind, xaxis_labeled):
    
    p = inflect.engine()
    if xaxis_labeled == True:
        df.plot(x=column1name, y='entries', title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel(column1name)
    else:
        df.plot(title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel("{0} row index".format(column1name))
        
    plt.ylabel('{0} entries'.format(column1name))
    plt.legend(['entries'])
    plt.show()

In [1951]:
def plot_histogram(df, column_name, num_of_bins):
    df[column_name].plot(kind='hist', bins=num_of_bins, alpha=0.5, color='green')
    plt.ylabel('frequency')
    plt.show()

UNIT STATISTICS


In [1952]:
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
plot_data(unit_df, 'unit', 'line', False)


UNITS AND THEIR ENTRIES
   unit  entries
0  R318   112098
1  R319   254531
2  R312    73913

              entries
count      240.000000
mean    335254.895833
std     334849.388932
min          0.000000
25%     131148.000000
50%     221479.500000
75%     409285.750000
max    1868674.000000
range  1868674.000000

UNIT SUMMARY

Clearly, certain units received more entries than other units.

DATE STATISTICS


In [1956]:
date_entries_hourly = map_column_to_entries_hourly(data['DATEn'])
date_df = display_basic_stats(date_entries_hourly, 'date')
plot_data(date_df, 'date', 'line', False)


DATES AND THEIR ENTRIES
       date  entries
0  05-30-11  1409572
1  05-15-11  1413929
2  05-04-11  3118915

              entries
count       31.000000
mean   2595521.774194
std     710440.834289
min    1400098.000000
25%    1891834.000000
50%    3009536.000000
75%    3137683.000000
max    3201840.000000
range  1801742.000000

DATE SUMMARY

Clearly, certain dates received more entries than other dates.

HOUR STATISTICS


In [1957]:
hour_entries_hourly = map_column_to_entries_hourly(data['hour'])
hour_df = display_basic_stats(hour_entries_hourly, 'hour')
plot_data(hour_df, 'hour', 'line', True)


HOURS AND THEIR ENTRIES
   hour   entries
0     0  10353167
1     4   2300788
2     8   5198583

               entries
count         6.000000
mean   13410195.833333
std     8863957.086415
min     2300788.000000
25%     6487229.000000
50%    13593103.500000
75%    20772247.000000
max    23690281.000000
range  21389493.00000

HOUR SUMMARY

Clearly, certain hours received more entries than other hours.

WEEKDAY STATISTICS


In [1958]:
weekday_entries_hourly = map_column_to_entries_hourly(data['day_week'])
weekday_df = display_basic_stats(weekday_entries_hourly, 'weekday')
plot_data(weekday_df, 'weekday', 'line', True)


WEEKDAYS AND THEIR ENTRIES
   weekday   entries
0        0  12795107
1        1  15246943
2        2  12592691

               entries
count         7.000000
mean   11494453.571429
std     2989933.638739
min     7218706.000000
25%     9949293.000000
50%    12592691.000000
75%    12752124.500000
max    15246943.000000
range  8028237.000000

WEEKDAY SUMMARY

Clearly, certain weekdays received more entries than other weekdays.

STATION STATISTICS


In [1959]:
station_entries_hourly = map_column_to_entries_hourly(data['station'])
station_df = display_basic_stats(station_entries_hourly, 'station')
plot_data(station_df, 'station', 'line', False)


STATIONS AND THEIR ENTRIES
           station  entries
0  LEXINGTON-53 ST   930423
1           207 ST   160382
2      BEACH 67 ST    82119

              entries
count      207.000000
mean    388701.328502
std     457501.301121
min          0.000000
25%     140102.000000
50%     225183.000000
75%     473735.000000
max    2920887.000000
range  2920887.000000

STATION SUMMARY

Clearly, certain stations received more entries than other stations.

LATITUDE STATISTICS


In [1960]:
latitude_entries_hourly = map_column_to_entries_hourly(data['latitude'])
latitude_df = display_basic_stats(latitude_entries_hourly, 'latitude')
plot_data(latitude_df, 'latitude', 'scatter', True)
plot_histogram(latitude_df, 'latitude', 15)


LATITUDES AND THEIR ENTRIES
    latitude  entries
0  40.852417     7559
1  40.707840   209745
2  40.643982   102508

              entries
count      233.000000
mean    345326.931330
std     393653.267874
min          0.000000
25%     131511.000000
50%     218938.000000
75%     402883.000000
max    2920887.000000
range  2920887.000000

LONGITUDE STATISTICS


In [1961]:
longitude_entries_hourly = map_column_to_entries_hourly(data['longitude'])
longitude_df = display_basic_stats(longitude_entries_hourly, 'longitude')
plot_data(longitude_df, 'longitude', 'scatter', True)
plot_histogram(longitude_df, 'longitude', 10)


LONGITUDES AND THEIR ENTRIES
   longitude  entries
0 -73.977417   911174
1 -73.828125   193792
2 -74.014099   694605

              entries
count      234.000000
mean    343851.175214
std     393424.158576
min          0.000000
25%     130422.000000
50%     217648.000000
75%     402551.250000
max    2920887.000000
range  2920887.000000

RAIN STATISTICS


In [1962]:
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = display_basic_stats(rain_entries_hourly, 'rain')
plot_data(rain_df, 'rain', 'bar', True)


RAINS AND THEIR ENTRIES
   rain   entries
0     0  61020916
1     1  19440259

               entries
count         2.000000
mean   40230587.500000
std    29401964.530892
min    19440259.000000
25%    29835423.250000
50%    40230587.500000
75%    50625751.750000
max    61020916.000000
range  41580657.00000

In [1963]:
rain_days = data[data['rain'] == 0]
no_rain_days = data[data['rain'] == 1]

print "RAIN DAYS"
print rain_days['ENTRIESn_hourly'].describe()
print
print "NO-RAIN DAYS"
print no_rain_days['ENTRIESn_hourly'].describe()


RAIN DAYS
count    33064.000000
mean      1845.539439
std       2878.770848
min          0.000000
25%        269.000000
50%        893.000000
75%       2197.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

NO-RAIN DAYS
count     9585.000000
mean      2028.196035
std       3189.433373
min          0.000000
25%        295.000000
50%        939.000000
75%       2424.000000
max      32289.000000
Name: ENTRIESn_hourly, dtype: float64

In [1964]:
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=20, alpha=0.5, color='blue')
no_rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=15, alpha=0.5, color='yellow')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['rain', 'no rain'])
plt.show()


RAIN SUMMARY

While days-with-rain occur in greater number in this data set (thus, contributing to higher-frequency counts), the distribution of ENTRIESn_hourly for rain and no-rain days seems comparable according to the above histogram.


Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?


In [1968]:
# perform statistical tests on rain/no-rain days to compare means and stds

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?