In [1943]:

    
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# import data
data = pd.read_csv(filename)

INITIAL DATA EXPLORATION



In [1944]:

    
print "SHAPE: " + str(data.shape)
data.head(3)









    



SHAPE: (42649, 27)






    Out[1944]:






  
    
      
      UNIT
      DATEn
      TIMEn
      ENTRIESn
      EXITSn
      ENTRIESn_hourly
      EXITSn_hourly
      datetime
      hour
      day_week
      ...
      pressurei
      rain
      tempi
      wspdi
      meanprecipi
      meanpressurei
      meantempi
      meanwspdi
      weather_lat
      weather_lon
    
  
  
    
      0
       R003
       05-01-11
       00:00:00
       4388333
       2911002
       0
       0
       2011-05-01 00:00:00
        0
       6
      ...
       30.22
       0
       55.9
       3.5
       0
       30.258
       55.98
       7.86
       40.700348
      -73.887177
    
    
      1
       R003
       05-01-11
       04:00:00
       4388333
       2911002
       0
       0
       2011-05-01 04:00:00
        4
       6
      ...
       30.25
       0
       52.0
       3.5
       0
       30.258
       55.98
       7.86
       40.700348
      -73.887177
    
    
      2
       R003
       05-01-11
       12:00:00
       4388333
       2911002
       0
       0
       2011-05-01 12:00:00
       12
       6
      ...
       30.28
       0
       62.1
       6.9
       0
       30.258
       55.98
       7.86
       40.700348
      -73.887177
    
  

3 rows × 27 columns



In [1945]:

    
print "COLUMNAR DATA TYPES"
data.dtypes









    



COLUMNAR DATA TYPES






    Out[1945]:





UNIT                object
DATEn               object
TIMEn               object
ENTRIESn             int64
EXITSn               int64
ENTRIESn_hourly    float64
EXITSn_hourly      float64
datetime            object
hour                 int64
day_week             int64
weekday              int64
station             object
latitude           float64
longitude          float64
conds               object
fog                  int64
precipi            float64
pressurei          float64
rain                 int64
tempi              float64
wspdi              float64
meanprecipi        float64
meanpressurei      float64
meantempi          float64
meanwspdi          float64
weather_lat        float64
weather_lon        float64
dtype: object



In [1946]:

    
data['ENTRIESn_hourly'].describe()









    Out[1946]:





count    42649.000000
mean      1886.589955
std       2952.385585
min          0.000000
25%        274.000000
50%        905.000000
75%       2255.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

DATA AND FUNCTIONS FOR GATHERING INITIAL STATISTICS

[ N.B. Due to decisions described the Unit_Entries supplement, in the current analysis, unless otherwise noted, entries will refer to a summation of ENTRIESn_hourly per UNIT (i.e., not, as might be expected, values in the ENTRIESn column). ]



In [1947]:

    
entries_hourly_by_row = data['ENTRIESn_hourly'].values



In [1948]:

    
def map_column_to_entries_hourly(column):
    instances = column.values # e.g., longitude_instances = data['longitude'].values
    
    # reduce
    entries_hourly = {} # e.g., longitude_entries_hourly = {}
    for i in np.arange(len(instances)): 
        if instances[i] in entries_hourly:
            entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
        else:
            entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
            
    return entries_hourly # e.g., longitudes, entries



In [1949]:

    
def display_basic_stats(entries_hourly_dict, column1name):
    # e.g, longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
    df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
    
    p = inflect.engine()
    print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
    print df.head(3)
    
    print 
    print pd.DataFrame(df['entries']).describe()
    print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
    
    return df # e.g, longitude_df



In [1950]:

    
def plot_data(df, column1name, plot_kind, xaxis_labeled):
    
    p = inflect.engine()
    if xaxis_labeled == True:
        df.plot(x=column1name, y='entries', title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel(column1name)
    else:
        df.plot(title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel("{0} row index".format(column1name))
        
    plt.ylabel('{0} entries'.format(column1name))
    plt.legend(['entries'])
    plt.show()



In [1951]:

    
def plot_histogram(df, column_name, num_of_bins):
    df[column_name].plot(kind='hist', bins=num_of_bins, alpha=0.5, color='green')
    plt.ylabel('frequency')
    plt.show()

UNIT STATISTICS



In [1952]:

    
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
plot_data(unit_df, 'unit', 'line', False)









    



UNITS AND THEIR ENTRIES
   unit  entries
0  R318   112098
1  R319   254531
2  R312    73913

              entries
count      240.000000
mean    335254.895833
std     334849.388932
min          0.000000
25%     131148.000000
50%     221479.500000
75%     409285.750000
max    1868674.000000
range  1868674.000000

UNIT SUMMARY

Clearly, certain units received more entries than other units.

DATE STATISTICS



In [1956]:

    
date_entries_hourly = map_column_to_entries_hourly(data['DATEn'])
date_df = display_basic_stats(date_entries_hourly, 'date')
plot_data(date_df, 'date', 'line', False)









    



DATES AND THEIR ENTRIES
       date  entries
0  05-30-11  1409572
1  05-15-11  1413929
2  05-04-11  3118915

              entries
count       31.000000
mean   2595521.774194
std     710440.834289
min    1400098.000000
25%    1891834.000000
50%    3009536.000000
75%    3137683.000000
max    3201840.000000
range  1801742.000000

DATE SUMMARY

Clearly, certain dates received more entries than other dates.

HOUR STATISTICS



In [1957]:

    
hour_entries_hourly = map_column_to_entries_hourly(data['hour'])
hour_df = display_basic_stats(hour_entries_hourly, 'hour')
plot_data(hour_df, 'hour', 'line', True)









    



HOURS AND THEIR ENTRIES
   hour   entries
0     0  10353167
1     4   2300788
2     8   5198583

               entries
count         6.000000
mean   13410195.833333
std     8863957.086415
min     2300788.000000
25%     6487229.000000
50%    13593103.500000
75%    20772247.000000
max    23690281.000000
range  21389493.00000

HOUR SUMMARY

Clearly, certain hours received more entries than other hours.

WEEKDAY STATISTICS



In [1958]:

    
weekday_entries_hourly = map_column_to_entries_hourly(data['day_week'])
weekday_df = display_basic_stats(weekday_entries_hourly, 'weekday')
plot_data(weekday_df, 'weekday', 'line', True)









    



WEEKDAYS AND THEIR ENTRIES
   weekday   entries
0        0  12795107
1        1  15246943
2        2  12592691

               entries
count         7.000000
mean   11494453.571429
std     2989933.638739
min     7218706.000000
25%     9949293.000000
50%    12592691.000000
75%    12752124.500000
max    15246943.000000
range  8028237.000000

WEEKDAY SUMMARY

Clearly, certain weekdays received more entries than other weekdays.

STATION STATISTICS



In [1959]:

    
station_entries_hourly = map_column_to_entries_hourly(data['station'])
station_df = display_basic_stats(station_entries_hourly, 'station')
plot_data(station_df, 'station', 'line', False)









    



STATIONS AND THEIR ENTRIES
           station  entries
0  LEXINGTON-53 ST   930423
1           207 ST   160382
2      BEACH 67 ST    82119

              entries
count      207.000000
mean    388701.328502
std     457501.301121
min          0.000000
25%     140102.000000
50%     225183.000000
75%     473735.000000
max    2920887.000000
range  2920887.000000

STATION SUMMARY

Clearly, certain stations received more entries than other stations.

LATITUDE STATISTICS



In [1960]:

    
latitude_entries_hourly = map_column_to_entries_hourly(data['latitude'])
latitude_df = display_basic_stats(latitude_entries_hourly, 'latitude')
plot_data(latitude_df, 'latitude', 'scatter', True)
plot_histogram(latitude_df, 'latitude', 15)









    



LATITUDES AND THEIR ENTRIES
    latitude  entries
0  40.852417     7559
1  40.707840   209745
2  40.643982   102508

              entries
count      233.000000
mean    345326.931330
std     393653.267874
min          0.000000
25%     131511.000000
50%     218938.000000
75%     402883.000000
max    2920887.000000
range  2920887.000000

LONGITUDE STATISTICS



In [1961]:

    
longitude_entries_hourly = map_column_to_entries_hourly(data['longitude'])
longitude_df = display_basic_stats(longitude_entries_hourly, 'longitude')
plot_data(longitude_df, 'longitude', 'scatter', True)
plot_histogram(longitude_df, 'longitude', 10)









    



LONGITUDES AND THEIR ENTRIES
   longitude  entries
0 -73.977417   911174
1 -73.828125   193792
2 -74.014099   694605

              entries
count      234.000000
mean    343851.175214
std     393424.158576
min          0.000000
25%     130422.000000
50%     217648.000000
75%     402551.250000
max    2920887.000000
range  2920887.000000

RAIN STATISTICS



In [1962]:

    
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = display_basic_stats(rain_entries_hourly, 'rain')
plot_data(rain_df, 'rain', 'bar', True)









    



RAINS AND THEIR ENTRIES
   rain   entries
0     0  61020916
1     1  19440259

               entries
count         2.000000
mean   40230587.500000
std    29401964.530892
min    19440259.000000
25%    29835423.250000
50%    40230587.500000
75%    50625751.750000
max    61020916.000000
range  41580657.00000



In [1963]:

    
rain_days = data[data['rain'] == 0]
no_rain_days = data[data['rain'] == 1]

print "RAIN DAYS"
print rain_days['ENTRIESn_hourly'].describe()
print
print "NO-RAIN DAYS"
print no_rain_days['ENTRIESn_hourly'].describe()









    



RAIN DAYS
count    33064.000000
mean      1845.539439
std       2878.770848
min          0.000000
25%        269.000000
50%        893.000000
75%       2197.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

NO-RAIN DAYS
count     9585.000000
mean      2028.196035
std       3189.433373
min          0.000000
25%        295.000000
50%        939.000000
75%       2424.000000
max      32289.000000
Name: ENTRIESn_hourly, dtype: float64



In [1964]:

    
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=20, alpha=0.5, color='blue')
no_rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=15, alpha=0.5, color='yellow')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['rain', 'no rain'])
plt.show()

RAIN SUMMARY

While days-with-rain occur in greater number in this data set (thus, contributing to higher-frequency counts), the distribution of ENTRIESn_hourly for rain and no-rain days seems comparable according to the above histogram.

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?



In [1968]:

    
# perform statistical tests on rain/no-rain days to compare means and stds

INITIAL DATA EXPLORATION

DATA AND FUNCTIONS FOR GATHERING INITIAL STATISTICS

UNIT STATISTICS

DATE STATISTICS

HOUR STATISTICS

WEEKDAY STATISTICS

STATION STATISTICS

LATITUDE STATISTICS

LONGITUDE STATISTICS

RAIN STATISTICS

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?

	UNIT	DATEn	TIMEn	ENTRIESn	EXITSn	datetime	hour	day_week	...	pressurei	tempi	wspdi	meanpressurei	meantempi	meanwspdi	weather_lat	weather_lon
0	R003	05-01-11	00:00:00	4388333	2911002	2011-05-01 00:00:00	0	6	...	30.22	55.9	3.5	30.258	55.98	7.86	40.700348	-73.887177
1	R003	05-01-11	04:00:00	4388333	2911002	2011-05-01 04:00:00	4	6	...	30.25	52.0	3.5	30.258	55.98	7.86	40.700348	-73.887177
2	R003	05-01-11	12:00:00	4388333	2911002	2011-05-01 12:00:00	12	6	...	30.28	62.1	6.9	30.258	55.98	7.86	40.700348	-73.887177

INITIAL DATA EXPLORATION

DATA AND FUNCTIONS FOR GATHERING INITIAL STATISTICS

NON-WEATHER-RELATED DATA

UNIT STATISTICS

DATE STATISTICS

HOUR STATISTICS

WEEKDAY STATISTICS

STATION STATISTICS

LATITUDE STATISTICS

LONGITUDE STATISTICS

WEATHER-RELATED DATA

RAIN STATISTICS

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?