In [1460]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# import data
data = pd.read_csv(filename)

INITIAL DATA EXPLORATION


In [1461]:
print "SHAPE: " + str(data.shape)
data.head(3)


SHAPE: (42649, 27)
Out[1461]:
UNIT DATEn TIMEn ENTRIESn EXITSn ENTRIESn_hourly EXITSn_hourly datetime hour day_week ... pressurei rain tempi wspdi meanprecipi meanpressurei meantempi meanwspdi weather_lat weather_lon
0 R003 05-01-11 00:00:00 4388333 2911002 0 0 2011-05-01 00:00:00 0 6 ... 30.22 0 55.9 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
1 R003 05-01-11 04:00:00 4388333 2911002 0 0 2011-05-01 04:00:00 4 6 ... 30.25 0 52.0 3.5 0 30.258 55.98 7.86 40.700348 -73.887177
2 R003 05-01-11 12:00:00 4388333 2911002 0 0 2011-05-01 12:00:00 12 6 ... 30.28 0 62.1 6.9 0 30.258 55.98 7.86 40.700348 -73.887177

3 rows × 27 columns


In [1462]:
print "COLUMNAR DATA TYPES"
data.dtypes


COLUMNAR DATA TYPES
Out[1462]:
UNIT                object
DATEn               object
TIMEn               object
ENTRIESn             int64
EXITSn               int64
ENTRIESn_hourly    float64
EXITSn_hourly      float64
datetime            object
hour                 int64
day_week             int64
weekday              int64
station             object
latitude           float64
longitude          float64
conds               object
fog                  int64
precipi            float64
pressurei          float64
rain                 int64
tempi              float64
wspdi              float64
meanprecipi        float64
meanpressurei      float64
meantempi          float64
meanwspdi          float64
weather_lat        float64
weather_lon        float64
dtype: object

In [1463]:
data['ENTRIESn_hourly'].describe()


Out[1463]:
count    42649.000000
mean      1886.589955
std       2952.385585
min          0.000000
25%        274.000000
50%        905.000000
75%       2255.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

DATA AND FUNCTIONS FOR GATHERING INITIAL STATISTICS


In [1464]:
entries_hourly_by_row = data['ENTRIESn_hourly'].values

In [1465]:
def map_column_to_entries_hourly(column):
    instances = column.values # e.g., longitude_instances = data['longitude'].values
    
    # reduce
    entries_hourly = {} # e.g., longitude_entries_hourly = {}
    for i in np.arange(len(instances)): 
        if instances[i] in entries_hourly:
            entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
        else:
            entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
            
    return entries_hourly # e.g., longitudes, entries

In [1466]:
def display_basic_stats(entries_hourly_dict, column1name):
    # e.g, longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
    df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
    
    p = inflect.engine()
    print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
    print df.head(3)
    
    print 
    print pd.DataFrame(df['entries']).describe()
    print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
    
    return df # e.g, longitude_df

In [1467]:
def plot_data(df, column1name, plot_kind, xaxis_labeled):
    
    p = inflect.engine()
    if xaxis_labeled == True:
        df.plot(x=column1name, y='entries', title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5)
        plt.xlabel(column1name)
    else:
        df.plot(title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5)
        plt.xlabel("{0} row index".format(column1name))
        
    plt.ylabel('{0} entries'.format(column1name))
    plt.legend(['entries'])
    plt.show()

In [1468]:
def plot_histogram(df, column_name, num_of_bins):
    df[column_name].plot(kind='hist', bins=num_of_bins, alpha=0.5, color='green')
    plt.ylabel('frequency')
    plt.show()

UNIT STATISTICS


In [1469]:
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
plot_data(unit_df, 'unit', 'line', False)


UNITS AND THEIR ENTRIES
   unit  entries
0  R318   112098
1  R319   254531
2  R312    73913

              entries
count      240.000000
mean    335254.895833
std     334849.388932
min          0.000000
25%     131148.000000
50%     221479.500000
75%     409285.750000
max    1868674.000000
range  1868674.000000

In [1470]:
print data.groupby('UNIT')['DATEn'].count().head()
print
print data['UNIT'].describe()


UNIT
R003    168
R004    175
R005    172
R006    180
R007    170
Name: DATEn, dtype: int64

count     42649
unique      240
top        R084
freq        186
Name: UNIT, dtype: object

[At first, there is no clear reason why UNIT row-counts differ from one another. However, given further examniation below, it seems clear that row-counts are related (if not determined) by the number of entries each unit receives, as evidenced by the most frequently occuring unit in the data set (R084) receiving the highest number of hourly entries.]


In [1594]:
units = data['UNIT'].value_counts()
print units[units == 186]

for unit in units.index:
    unit_df.loc[unit_df[unit_df['unit'] == unit].index[0], 'frequency'] = units[unit]
    
unit_df.sort(columns=['frequency', 'unit'], ascending=False, inplace=True)
print unit_df[unit_df['frequency'] == 186].shape


R084    186
R086    186
R019    186
R017    186
R013    186
R012    186
R099    186
R062    186
R239    186
R025    186
R321    186
R127    186
R257    186
R070    186
R080    186
R083    186
R220    186
R046    186
R044    186
R043    186
R126    186
R098    186
R049    186
R291    186
R020    186
R022    186
R023    186
R029    186
R108    186
R102    186
R101    186
R105    186
R116    186
R172    186
R033    186
R031    186
R030    186
R115    186
R137    186
R111    186
R163    186
R179    186
R041    186
R027    186
R211    186
R204    186
R227    186
R055    186
R057    186
R051    186
R276    186
R300    186
Length: 52, dtype: int64
(52, 3)

DATE STATISTICS


In [1472]:
date_entries_hourly = map_column_to_entries_hourly(data['DATEn'])
date_df = display_basic_stats(date_entries_hourly, 'date')
plot_data(date_df, 'date', 'line', False)


DATES AND THEIR ENTRIES
       date  entries
0  05-30-11  1409572
1  05-15-11  1413929
2  05-04-11  3118915

              entries
count       31.000000
mean   2595521.774194
std     710440.834289
min    1400098.000000
25%    1891834.000000
50%    3009536.000000
75%    3137683.000000
max    3201840.000000
range  1801742.000000

HOUR STATISTICS


In [1473]:
hour_entries_hourly = map_column_to_entries_hourly(data['hour'])
hour_df = display_basic_stats(hour_entries_hourly, 'hour')
plot_data(hour_df, 'hour', 'line', True)


HOURS AND THEIR ENTRIES
   hour   entries
0     0  10353167
1     4   2300788
2     8   5198583

               entries
count         6.000000
mean   13410195.833333
std     8863957.086415
min     2300788.000000
25%     6487229.000000
50%    13593103.500000
75%    20772247.000000
max    23690281.000000
range  21389493.00000

WEEKDAY STATISTICS


In [1474]:
weekday_entries_hourly = map_column_to_entries_hourly(data['day_week'])
weekday_df = display_basic_stats(weekday_entries_hourly, 'weekday')
plot_data(weekday_df, 'weekday', 'line', True)


WEEKDAYS AND THEIR ENTRIES
   weekday   entries
0        0  12795107
1        1  15246943
2        2  12592691

               entries
count         7.000000
mean   11494453.571429
std     2989933.638739
min     7218706.000000
25%     9949293.000000
50%    12592691.000000
75%    12752124.500000
max    15246943.000000
range  8028237.000000

STATION STATISTICS


In [1475]:
station_entries_hourly = map_column_to_entries_hourly(data['station'])
station_df = display_basic_stats(station_entries_hourly, 'station')
plot_data(station_df, 'station', 'line', False)


STATIONS AND THEIR ENTRIES
           station  entries
0  LEXINGTON-53 ST   930423
1           207 ST   160382
2      BEACH 67 ST    82119

              entries
count      207.000000
mean    388701.328502
std     457501.301121
min          0.000000
25%     140102.000000
50%     225183.000000
75%     473735.000000
max    2920887.000000
range  2920887.000000

LATITUDE STATISTICS


In [1476]:
latitude_entries_hourly = map_column_to_entries_hourly(data['latitude'])
latitude_df = display_basic_stats(latitude_entries_hourly, 'latitude')
plot_data(latitude_df, 'latitude', 'scatter', True)
plot_histogram(latitude_df, 'latitude', 15)


LATITUDES AND THEIR ENTRIES
    latitude  entries
0  40.852417     7559
1  40.707840   209745
2  40.643982   102508

              entries
count      233.000000
mean    345326.931330
std     393653.267874
min          0.000000
25%     131511.000000
50%     218938.000000
75%     402883.000000
max    2920887.000000
range  2920887.000000

LONGITUDE STATISTICS


In [1477]:
longitude_entries_hourly = map_column_to_entries_hourly(data['longitude'])
longitude_df = display_basic_stats(longitude_entries_hourly, 'longitude')
plot_data(longitude_df, 'longitude', 'scatter', True)
plot_histogram(longitude_df, 'longitude', 10)


LONGITUDES AND THEIR ENTRIES
   longitude  entries
0 -73.977417   911174
1 -73.828125   193792
2 -74.014099   694605

              entries
count      234.000000
mean    343851.175214
std     393424.158576
min          0.000000
25%     130422.000000
50%     217648.000000
75%     402551.250000
max    2920887.000000
range  2920887.000000

RAIN STATISTICS


In [1478]:
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = display_basic_stats(rain_entries_hourly, 'rain')
plot_data(rain_df, 'rain', 'bar', True)


RAINS AND THEIR ENTRIES
   rain   entries
0     0  61020916
1     1  19440259

               entries
count         2.000000
mean   40230587.500000
std    29401964.530892
min    19440259.000000
25%    29835423.250000
50%    40230587.500000
75%    50625751.750000
max    61020916.000000
range  41580657.00000

In [1479]:
rain_days = data[data['rain'] == 0]
no_rain_days = data[data['rain'] == 1]

print "RAIN DAYS"
print rain_days['ENTRIESn_hourly'].describe()
print
print "NO-RAIN DAYS"
print no_rain_days['ENTRIESn_hourly'].describe()


RAIN DAYS
count    33064.000000
mean      1845.539439
std       2878.770848
min          0.000000
25%        269.000000
50%        893.000000
75%       2197.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

NO-RAIN DAYS
count     9585.000000
mean      2028.196035
std       3189.433373
min          0.000000
25%        295.000000
50%        939.000000
75%       2424.000000
max      32289.000000
Name: ENTRIESn_hourly, dtype: float64

In [1480]:
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=20, alpha=0.5, color='blue')
no_rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=15, alpha=0.5, color='yellow')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['rain', 'no rain'])
plt.show()


Rainy/Non-rainy days are technically not random, although they may be considered random for most non-meterological purposes. Moreover, rainy/non-rainy days tend to cluster (for meterological reasons). In the current data set, certain days are labeled as both 'rain' and 'no rain', which, assumedly, means that rain occurrred in certain locations while it did not in others on the same day. [Thankfully, the current data set is not as double-minded when it reports (only) either 'rain' or 'no-rain' at individual station locations in a single day.]


In [1481]:
date_and_rain = data[['DATEn', 'rain']].drop_duplicates()
date_and_rain.sort(columns='DATEn', inplace=True)
print date_and_rain.head()

dates = data['DATEn'].unique()
rain_dates = date_and_rain[date_and_rain['rain'] == 1]['DATEn'].unique()
no_rain_dates = date_and_rain[date_and_rain['rain'] == 0]['DATEn'].unique()

indices_of_rain_dates = []
for rain_date in rain_dates:
    indices_of_rain_dates.append(np.where(dates == rain_date)[0][0])

indices_of_no_rain_dates = []
for no_rain_date in no_rain_dates:
    indices_of_no_rain_dates.append(np.where(dates == no_rain_date)[0][0])

plt.title('RAIN AND NO-RAIN DAYS')
plt.xticks(np.arange(len(dates)), dates, rotation='vertical')
plt.yticks([0,1])
plt.ylabel('rain')

plt.scatter(indices_of_rain_dates, np.ones(len(indices_of_rain_dates)), color='blue')
plt.scatter(indices_of_no_rain_dates, np.zeros(len(indices_of_no_rain_dates)), color='yellow', edgecolors='black')

plt.legend(['rain', 'no rain'], bbox_to_anchor=(1.05, 1), loc=2)
plt.show()


          DATEn  rain
0      05-01-11     0
5      05-02-11     0
11     05-03-11     0
16     05-04-11     1
32542  05-04-11     0

In [1481]:


INITIAL EXPLORATION SUMMARY

While days-with-rain occur in greater number in this data set (thus, contributing to any possible higher-frequency counts), the distribution of ENTRIESn_hourly for rain and no-rain days seems comparable according to the above histogram.

In contrast, the non-weather-related data

Section 1: Statistical Test

1.a Which statistical test did you use to analyze the NYC subway data?


In [1482]:
# perform statistical tests on rain/no-rain days to compare means and stds

1.b Did you use a one-tail or a two-tail P value?

1.c What is the null hypothesis?