In [1]:
import inflect # for pluralizing words in labels
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [2]:
print "SHAPE: " + str(data.shape)
data.head(1)
Out[2]:
In [3]:
data.dtypes
Out[3]:
In [4]:
data['ENTRIESn_hourly'].describe()
Out[4]:
[ N.B. Due to decisions described in the *Unit_Entries* supplement, in the current analysis, unless otherwise noted, *entries* refers to a summation of ENTRIESn_hourly per UNIT (i.e., not, as might be expected, simply the values in the ENTRIESn column). ]
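For reference, that per-UNIT summation is equivalent in spirit to a pandas groupby; a minimal sketch (the unit_totals name is illustrative, not part of the analysis cells):

unit_totals = data.groupby('UNIT')['ENTRIESn_hourly'].sum()  # total entries per UNIT
print(unit_totals.head(3))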
In [5]:
entries_hourly_by_row = data['ENTRIESn_hourly'].values
In [6]:
def map_column_to_entries_hourly(column):
    instances = column.values # e.g., longitude_instances = data['longitude'].values
    # reduce
    entries_hourly = {} # e.g., longitude_entries_hourly = {}
    for i in np.arange(len(instances)):
        if instances[i] in entries_hourly:
            entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
        else:
            entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
    return entries_hourly # e.g., longitudes, entries
In [7]:
def display_basic_stats(entries_hourly_dict, column1name):
    # e.g., longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
    df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
    p = inflect.engine()
    print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
    print df.head(3)
    print
    print pd.DataFrame(df['entries']).describe()
    print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
    return df # e.g., longitude_df
In [8]:
def plot_data(df, column1name, plot_kind, xaxis_labeled):
    p = inflect.engine()
    if xaxis_labeled:
        df.plot(x=column1name, y='entries', title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel(column1name)
    else:
        df.plot(title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
        plt.xlabel("{0} row index".format(column1name))
    plt.ylabel('{0} entries'.format(column1name))
    plt.legend(['entries'])
    plt.show()
In [9]:
def plot_histogram(df, column_name, num_of_bins):
    df[column_name].plot(kind='hist', bins=num_of_bins, alpha=0.5, color='green')
    plt.ylabel('frequency')
    plt.show()
In [10]:
def describe_samples(sample1, sample2):
    size1, min_max1, mean1, var1, skew1, kurt1 = st.describe(sample1)
    size2, min_max2, mean2, var2, skew2, kurt2 = st.describe(sample2)
    med1 = np.median(sample1)
    med2 = np.median(sample2)
    std1 = np.std(sample1)
    std2 = np.std(sample2)
    print "Sample 1 (rainy days):\n min = {0}, max = {1},\n mean = {2:.2f}, median = {3}, var = {4:.2f}, std = {5:.2f}".format(min_max1[0], min_max1[1], mean1, med1, var1, std1)
    print "Sample 2 (non-rainy days):\n min = {0}, max = {1},\n mean = {2:.2f}, median = {3}, var = {4:.2f}, std = {5:.2f}".format(min_max2[0], min_max2[1], mean2, med2, var2, std2)
In [11]:
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
plot_data(unit_df, 'unit', 'line', False)
In [12]:
unit_df.sort(columns='entries', ascending=False).head(5)
Out[12]:
In [13]:
date_entries_hourly = map_column_to_entries_hourly(data['DATEn'])
date_df = display_basic_stats(date_entries_hourly, 'date')
plot_data(date_df, 'date', 'line', False)
In [14]:
date_df.sort(columns='entries', ascending=False).head(5)
Out[14]:
In [15]:
hour_entries_hourly = map_column_to_entries_hourly(data['hour'])
hour_df = display_basic_stats(hour_entries_hourly, 'hour')
plot_data(hour_df, 'hour', 'line', True)
In [16]:
hour_df.sort(columns='entries', ascending=False).head(5)
Out[16]:
In [17]:
weekday_entries_hourly = map_column_to_entries_hourly(data['day_week'])
weekday_df = display_basic_stats(weekday_entries_hourly, 'weekday')
plot_data(weekday_df, 'weekday', 'line', True)
In [18]:
weekday_df.sort(columns='entries', ascending=False).head(5)
Out[18]:
In [19]:
# disambiguate stations that share a name by appending their coordinates
data['unique_station'] = data['station'] + " (" + data['latitude'].map(str) + ", " + data['longitude'].map(str) + ")"
In [20]:
station_entries_hourly = map_column_to_entries_hourly(data['unique_station'])
station_df = display_basic_stats(station_entries_hourly, 'unique_station')
plot_data(station_df, 'unique_station', 'line', False)
In [21]:
latitude_entries_hourly = map_column_to_entries_hourly(data['latitude'])
latitude_df = display_basic_stats(latitude_entries_hourly, 'latitude')
plot_data(latitude_df, 'latitude', 'scatter', True)
In [22]:
longitude_entries_hourly = map_column_to_entries_hourly(data['longitude'])
longitude_df = display_basic_stats(longitude_entries_hourly, 'longitude')
plot_data(longitude_df, 'longitude', 'scatter', True)
The top-ten most-entered stations are the ones that someone familiar with New York City might expect (most of them clustered on Manhattan Island):
In [39]:
station_location = data[['unique_station', 'latitude', 'longitude']].copy() # copy to avoid mutating a slice of data
station_location.drop_duplicates(inplace=True)
station_location_entries = pd.merge(station_location, station_df, on='unique_station')
station_location_entries.sort(columns='entries', ascending=False, inplace=True)
top_ten = station_location_entries.head(10)
print top_ten[['unique_station', 'entries']]
plt.figure(figsize = (5,5))
plt.title('TOP-TEN-ENTERED STATIONS')
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.xlim(station_location_entries['longitude'].min(), station_location_entries['longitude'].max())
plt.ylim(station_location_entries['latitude'].min(), station_location_entries['latitude'].max())
img = plt.imread('NYmap.png')
plt.scatter(top_ten['longitude'], top_ten['latitude'], color='#00FF00', edgecolors='black', zorder=1)
plt.imshow(img, zorder=0, extent=[station_location_entries['longitude'].min(), station_location_entries['longitude'].max(), station_location_entries['latitude'].min(), station_location_entries['latitude'].max()])
plt.show()
While the simple observations above suggest that non-weather-related variables have a tremendous impact on transit usage, it is only fair to explore possible weather-related influences as well (especially since the project guidelines expect it). Of the weather-related data made available, the two most sensible variables to check are rain (binary) and temperature (in degrees Fahrenheit).
In [24]:
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = display_basic_stats(rain_entries_hourly, 'rain')
plot_data(rain_df, 'rain', 'bar', True)
In [25]:
rain_days = data[data['rain'] == 1]
no_rain_days = data[data['rain'] == 0]
print "RAIN DAYS"
print rain_days['ENTRIESn_hourly'].describe()
print
print "NO-RAIN DAYS"
print no_rain_days['ENTRIESn_hourly'].describe()
In [26]:
no_rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=18, alpha=0.5, color='yellow')
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=15, alpha=0.7, color='blue')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['no rain', 'rain'])
plt.show()
While the two groups are distributed similarly, and thus comparable, non-rainy days clearly have a higher number of entries than rainy days, which might strike most readers as counterintuitive.
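A quick way to quantify that gap is to aggregate by the rain flag; a minimal sketch, separate from the analysis cells:

# total and mean ENTRIESn_hourly by rain flag (0 = no rain, 1 = rain)
print(data.groupby('rain')['ENTRIESn_hourly'].sum())
print(data.groupby('rain')['ENTRIESn_hourly'].mean())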
In [27]:
date_and_rain = data[['DATEn', 'rain']].drop_duplicates()
date_and_rain.sort(columns='DATEn', inplace=True)
print date_and_rain.head()
dates = data['DATEn'].unique()
rain_dates = date_and_rain[date_and_rain['rain'] == 1]['DATEn'].unique()
no_rain_dates = date_and_rain[date_and_rain['rain'] == 0]['DATEn'].unique()
indices_of_rain_dates = []
for rain_date in rain_dates:
    indices_of_rain_dates.append(np.where(dates == rain_date)[0][0])
indices_of_no_rain_dates = []
for no_rain_date in no_rain_dates:
    indices_of_no_rain_dates.append(np.where(dates == no_rain_date)[0][0])
plt.title('RAIN AND NO-RAIN DAYS')
plt.xticks(np.arange(len(dates)), dates, rotation='vertical')
plt.yticks([0,1])
plt.xlabel('date')
plt.ylabel('rain')
plt.scatter(indices_of_rain_dates, np.ones(len(indices_of_rain_dates)), color='blue')
plt.scatter(indices_of_no_rain_dates, np.zeros(len(indices_of_no_rain_dates)), color='yellow', edgecolors='black')
plt.legend(['rain', 'no rain'], bbox_to_anchor=(1.05, 1), loc=2)
plt.show()
One possible explanation for the difference might simply be that there were more non-rainy days in May 2011, as the graph above indicates. [ N.B. Certain days are reported as both rainy and non-rainy. For a brief exploration of this phenomenon, see the *Rain* supplement. ]
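A minimal sketch of how such double-labeled dates could be surfaced, assuming the date_and_rain frame built above (rain_flags_per_date is an illustrative name):

rain_flags_per_date = date_and_rain.groupby('DATEn')['rain'].nunique()  # distinct rain flags per date
print(rain_flags_per_date[rain_flags_per_date > 1])  # dates labeled both rainy and non-rainy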
What if the number of non-rainy observations were limited to the total number of rainy observations in the data set?
In [28]:
# draw as many non-rainy observations as there are rainy ones (9585 rows)
random_row_indices = np.random.choice(no_rain_days['ENTRIESn_hourly'].index.values, 9585)
In [29]:
no_rain_days['ENTRIESn_hourly'].loc[random_row_indices].plot(kind='hist', bins=10, alpha=0.5, color='yellow')
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=10, alpha=0.7, color='blue')
plt.title(r'ENTRIESn_hourly HISTOGRAM (by RAIN), $n = 9585$')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['no rain', 'rain'])
plt.show()
With the sample sizes and bin counts equal, the similarities between the two groups are obvious.
In [30]:
temp_entries_hourly = map_column_to_entries_hourly(data['tempi'])
temp_df = display_basic_stats(temp_entries_hourly, 'temp')
plot_data(temp_df, 'temp', 'scatter', True)
In [31]:
entries_rain_temp = data[['ENTRIESn_hourly', 'rain', 'tempi']].drop_duplicates()
entries_no_rain_temp = entries_rain_temp[entries_rain_temp['rain'] == 0]
entries_rain_temp = entries_rain_temp[entries_rain_temp['rain'] == 1]
plt.scatter(entries_no_rain_temp['tempi'], entries_no_rain_temp['ENTRIESn_hourly'], color='yellow', edgecolors='black', alpha=0.1)
plt.scatter(entries_rain_temp['tempi'], entries_rain_temp['ENTRIESn_hourly'], color='blue', alpha=0.1)
plt.title('TEMPERATURE, RAIN, AND ENTRIES')
plt.xlabel('temperature (F)')
plt.ylabel('ENTRIESn_hourly')
plt.show()
Based on the above scatter plots, there does seem to be some relationship between temperature and number of entries. In general, temperatures between $55$°F and $66$°F saw more entries.
Visually speaking, cold and rainy days seem to attract approximately the same number of entries as warm and non-rainy days (especially when one recalls that there were more non-rainy days in the data set).
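To put a rough number on that temperature band, a hedged sketch (band edges taken from the observation above; the in_band/out_band names are illustrative):

# mean ENTRIESn_hourly inside vs. outside the 55-66 F band
in_band = data[(data['tempi'] >= 55) & (data['tempi'] <= 66)]
out_band = data[(data['tempi'] < 55) | (data['tempi'] > 66)]
print("mean entries, 55-66 F: {0:.2f}".format(in_band['ENTRIESn_hourly'].mean()))
print("mean entries, elsewhere: {0:.2f}".format(out_band['ENTRIESn_hourly'].mean()))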
Taking simple random samples of size $n = 1000$ from each group, the following basic statistics reveal themselves.
In [32]:
n = 1000
sample1 = np.random.choice(rain_days['ENTRIESn_hourly'], size=n, replace=False)
sample2 = np.random.choice(no_rain_days['ENTRIESn_hourly'], size=n, replace=False)
describe_samples(sample1,sample2)
In [33]:
plt.boxplot([sample2, sample1], vert=False)
plt.title('NUMBER OF ENTRIES PER SAMPLE')
plt.xlabel('ENTRIESn_hourly')
plt.yticks([1, 2], ['Sample 2 (no rain)', 'Sample 1 (rain)'])
plt.show()
In [34]:
plt.hist(sample1, color='blue', alpha=0.7, bins=8)
plt.hist(sample2, color='yellow', alpha=0.5, bins=10)
plt.title(r'ENTRIESn_hourly HISTOGRAM (by RAIN), $n = 1000$')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['no rain', 'rain'])
plt.show()
Treating the rainy and non-rainy days as two independent populations, and although the answer is visually apparent from the above histogram, the following statistical test seeks to determine whether the two distributions are normal.
The Shapiro-Wilk normality test is a test of the null hypothesis that a sample comes from a normally distributed population.
The test is designed for small sample sizes, so subsamples of $n = 30$ are used here.
A $95\%$ level of confidence suggests that $95\%$ of such samples would produce similar statistical results.
For a $95\%$ level of confidence, the level of significance (i.e., the probability of making a Type I error) is $\alpha = 1 - 0.95 = 0.05$.
In [35]:
n = 30
alpha = 0.05
small_sample1 = np.random.choice(sample1, size=n, replace=False) # subsample of rainy-day entries
small_sample2 = np.random.choice(sample2, size=n, replace=False) # subsample of non-rainy-day entries
W1, p1 = st.shapiro(small_sample1)
W2, p2 = st.shapiro(small_sample2)
print "p1 < {0}: {1}".format(alpha, (p1 < alpha))
print "p2 < {0}: {1}".format(alpha, (p2 < alpha))
In both cases, $p < 0.05$, so the null hypotheses that the samples come from normally distributed populations are rejected.
Moreover, the following probability plots seal the deal (samples from normal distributions would hug the red regression line throughout the plot):
In [36]:
st.probplot(sample1, plot=plt)
plt.title('Probability Plot: Sample 1')
plt.show()
st.probplot(sample2, plot=plt)
plt.title('Probability Plot: Sample 2')
plt.show()