In [15]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR so the notebook runs on other machines
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data: load the turnstile/weather CSV into a DataFrame
data = pd.read_csv(filename)
In [16]:
# per-row ENTRIESn_hourly values; used below to aggregate entries per column value
entries_hourly_by_row = data['ENTRIESn_hourly'].values
In [17]:
def map_column_to_entries_hourly(column, entries=None):
    """Sum hourly entries for each distinct value of `column`.

    Parameters
    ----------
    column : pandas.Series
        Grouping column (e.g., data['rain'] or data['longitude']).
    entries : sequence of numbers, optional
        Per-row entry counts aligned row-for-row with `column`.
        Defaults to the module-level entries_hourly_by_row array,
        preserving the original call signature.

    Returns
    -------
    dict
        Maps each distinct column value to the float sum of its entries.
        e.g., longitudes -> entries
    """
    if entries is None:
        entries = entries_hourly_by_row
    instances = column.values  # e.g., longitude_instances = data['longitude'].values
    # reduce: accumulate entry totals per distinct value
    entries_hourly = {}  # e.g., longitude_entries_hourly = {}
    for value, count in zip(instances, entries):
        entries_hourly[value] = entries_hourly.get(value, 0.0) + float(count)
    return entries_hourly
In [18]:
def create_df(entries_hourly_dict, column1name):
    """Build a two-column DataFrame from a {value: entries_sum} dict.

    Parameters
    ----------
    entries_hourly_dict : dict
        Output of map_column_to_entries_hourly.
    column1name : str
        Name for the key column (e.g., 'longitude'); the second column
        is always named 'entries'.

    Returns
    -------
    pandas.DataFrame  # e.g., longitude_df
    """
    # list() is required on Python 3, where dict.items() returns a view
    # rather than a list (the original Python 2 call relied on a list).
    df = pd.DataFrame(data=list(entries_hourly_dict.items()),
                      columns=[column1name, 'entries'])
    return df
In [19]:
# Aggregate hourly entries by the rain indicator and keep row subsets for sampling.
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = create_df(rain_entries_hourly, 'rain')
# rows observed on rainy (rain == 1) vs. non-rainy (rain == 0) occasions
rain_days = data[data['rain'] == 1]
no_rain_days = data[data['rain'] == 0]
In [20]:
# Draw horizontal box plots of the two samples' ENTRIESn_hourly values.
def plot_box(sample1, sample2):
# sample2 is passed first so it sits at y-position 1 (bottom row)
plt.boxplot([sample2, sample1], vert=False)
plt.title('NUMBER OF ENTRIES PER SAMPLE')
plt.xlabel('ENTRIESn_hourly')
# y tick 1 -> 'Sample 2', y tick 2 -> 'Sample 1' (matches the boxplot order above)
plt.yticks([1, 2], ['Sample 2', 'Sample 1'])
plt.show()
In [21]:
def describe_samples(sample1, sample2):
    """Print summary statistics (min, max, mean, median, var, std) for two samples.

    sample1 is labeled as rainy days and sample2 as non-rainy days.
    Returns None; output goes to stdout.
    """
    # scipy.stats.describe returns (nobs, minmax, mean, variance, skewness, kurtosis)
    size1, min_max1, mean1, var1, skew1, kurt1 = st.describe(sample1)
    size2, min_max2, mean2, var2, skew2, kurt2 = st.describe(sample2)
    med1 = np.median(sample1)
    med2 = np.median(sample2)
    std1 = np.std(sample1)
    std2 = np.std(sample2)
    # print() calls (rather than Python 2 print statements) keep this runnable
    # on Python 3; single-argument print() output is identical on Python 2.
    print("Sample 1 (rainy days):\n min = {0}, max = {1},\n mean = {2:.2f}, median = {3}, var = {4:.2f}, std = {5:.2f}".format(min_max1[0], min_max1[1], mean1, med1, var1, std1))
    print("Sample 2 (non-rainy days):\n min = {0}, max = {1},\n mean = {2:.2f}, median = {3}, var = {4:.2f}, std = {5:.2f}".format(min_max2[0], min_max2[1], mean2, med2, var2, std2))
In [22]:
class MannWhitneyU:
    """Repeated Mann-Whitney U testing of rainy vs. non-rainy ENTRIESn_hourly samples.

    Draws simple random samples of size n (without replacement) from the
    module-level rain_days / no_rain_days DataFrames, runs
    scipy.stats.mannwhitneyu, and reports effect sizes
    (Wendt's rank-biserial correlation r and Cohen's d).
    """

    def __init__(self, n):
        # n: size of each sample drawn from both groups
        self.n = n
        # number of sample-and-test repetitions performed by trial_series()
        self.num_of_tests = 1000
        # most recent samples; populated by sample_and_test()
        self.sample1 = 0
        self.sample2 = 0

    def sample_and_test(self, plot, describe):
        """Draw fresh samples, optionally plot/describe them, and return (U, p)."""
        self.sample1 = np.random.choice(rain_days['ENTRIESn_hourly'], size=self.n, replace=False)
        self.sample2 = np.random.choice(no_rain_days['ENTRIESn_hourly'], size=self.n, replace=False)
        ### the following two self.sample2 assignments are for testing purposes ###
        #self.sample2 = self.sample1 # test when samples are same
        #self.sample2 = np.random.choice(np.random.randn(self.n),self.n) # test for when samples are very different
        if plot == True:
            plot_box(self.sample1, self.sample2)
        if describe == True:
            describe_samples(self.sample1, self.sample2)
        return st.mannwhitneyu(self.sample1, self.sample2)

    def effect_sizes(self, U):
        """Return (r, d) for the current samples given the U statistic.

        r is Wendt's rank-biserial correlation, d is Cohen's d.
        """
        # Wendt's rank-biserial correlation: r = 1 - 2U / (n1 * n2)
        r = (1 - np.true_divide((2*U), (self.n*self.n)))
        # Cohen's d with a pooled standard deviation
        # (np.true_divide guards against Python 2 integer division)
        s = np.sqrt(np.true_divide((((self.n-1)*np.std(self.sample1)**2) + ((self.n-1)*np.std(self.sample2)**2)), (self.n+self.n-2)))
        d = np.true_divide((np.mean(self.sample1) - np.mean(self.sample2)), s)
        return r, d

    def trial_series(self):
        """Run num_of_tests sample-and-test trials; print summary stats and plot p-values."""
        success = 0
        U_values = []
        p_values = []
        d_values = []
        r_values = []
        for i in np.arange(self.num_of_tests):
            U, p = self.sample_and_test(False, False)
            r, d = self.effect_sizes(U)
            U_values.append(U)
            # scipy.stats.mannwhitneyu returns p for a one-sided hypothesis,
            # so multiply by 2 for two-sided
            # NOTE(review): this assumes the old scipy default; newer scipy
            # defaults to alternative='two-sided' — confirm the installed version
            p_values.append(p*2)
            d_values.append(d)
            r_values.append(r)
            if p <= 0.05:
                success += 1
        # print() calls keep this runnable on Python 3 (originally Python 2 print statements)
        print("n = {0}".format(self.n))
        print("average U value: {0:.2f}".format(np.mean(U_values)))
        print("number of times p <= 0.05: {0}/{1} ({2}%)".format(success, self.num_of_tests, (np.true_divide(success,self.num_of_tests)*100)))
        print("average p value: {0:.2f}".format(np.mean(p_values)))
        print("average rank-biserial r value: {0:.2f}".format(np.mean(r_values)))
        print("average Cohen's d value: {0:.2f}".format(np.mean(d_values)))
        plt.hist(p_values, color='green', alpha=0.3)
        plt.show()
The Mann-Whitney $U$ test was used to determine if there was a statistically significant difference between the number of reported entries on rainy and non-rainy occasions. This nonparametric test of the equality of two population medians from independent samples was used since the distribution of entries is non-normal (right-skewed) and their shape is the same, as seen visually via histograms, probability plots, and box plots, and as the result of the Shapiro-Wilk normality test (see Preparation for Statistical Tests). However, since the sample sizes are so large, the parametric Welch's $t$-test likely could have been used (and, it was implemented for confirmation purposes, along with the nonparametric Wilcoxon signed-rank test; both agreed with the Mann-Whitney $U$ test results).
In [23]:
# Run the 1000-trial Mann-Whitney U series for a range of sample sizes;
# 9585 is the total number of rainy-day rows (N computed below).
sample_sizes = [30, 100, 500, 1500, 3000, 5000, 9585]
for n in sample_sizes:
MannWhitneyU(n).trial_series()
As witnessed above, when rainy and non-rainy days from the data set are considered populations (as opposed to samples themselves), it takes significantly large sample sizes from each population (e.g., $n = 3000$, which is more than $30\%$ of the total number of rainy days in the data set) to attain low $p$-values1 frequently enough to reject the null hypothesis of the Mann-Whitney $U$ test2 with the critical values proposed below.
Moreover, using Wendt's rank-biserial correlation $r$ and Cohen's $d$ to measure effect size, the relatively low average value of $r$3 and the low average value of $d$4 both suggest that the difference between the two samples (and, thus, the two populations) is trivial, even though, according to the Mann-Whitney $U$ test, the difference appears to be statistically significant (and only then with extremely large samples)5. In other words, statistical significance $\neq$ practical significance.
1 Identical samples would produce a large $p$ (e.g., $p \approx 0.49$); extremely different samples would produce a very small number (e.g., $p \approx 0$).
2 Identical samples would produce $U = \frac{n^{2}}{2}$ (e.g., when $n = 450$, $U = 101250$); extremely different samples can produce a $U$ that is orders of magnitude smaller (e.g., when $n = 450$, possibly $U = 1293$).
3 For very different samples, $r \rightarrow 1$; in the above tests, as $n$ increases, $r \rightarrow 0$.
4 For very different samples, $d \rightarrow 1$; in the above tests, as $n$ increases, $d$ tends to remain constant, $d \approx 0.06$, even when the sample size is extremely large. $d$ is interpreted as the difference in the number of standard deviations.
5 On the issue of $p$-values and large data sets, see Lin, M., Lucas, H.C., and Shmueli, G. Research Commentary—Too big to fail: Large samples and the P-value problem. Inf. Syst. Res. 2013; 24: 906–917. PDF here.
A two-tail $p$-value was selected since an appropriate initial question, given the results of the Weather-Related Data section of the DataExploration supplement, is simply whether or not there is a statistically significant difference between the populations (i.e., not whether one population is statistically-significantly greater than another).
In [24]:
# Population-size check: sample sizes should stay below 5% of N so that
# samples drawn without replacement can be treated as independent.
# print() calls keep this runnable on Python 3 (originally Python 2 print statements).
print("Shape of rainy-days data:" + str(rain_days.shape))
N = rain_days.shape[0]
print("N = " + str(N))
print("0.05 * N = " + str(0.05 * N))
The Mann-Whitney $U$ test is a nonparametric test of the null hypothesis that the distributions of two populations are the same.
To verify the assumption that the simple, randomly sampled values are independent, the sample sizes should be less than $5\%$ of the population sizes ($n \lt 0.05N$). Since the maximum number of rainy days is $9585$ ($N = 9585$), a reasonable sample size for each group would be $450$ ($n = 450$).
Null Hypothesis
$H_{0}$: $M_{1} = M_{2}$ or $H_{0}$: $M_{1} - M_{2} = 0$
Alternate Hypothesis
$H_{1}$: $M_{1} \neq M_{2}$ or $H_{1}$: $M_{1} - M_{2} \neq 0$
A $95\%$ level of confidence would suggest that $95\%$ of samples would produce similar statistical results.
For a $95\%$ level of confidence, the level of significance (i.e., the probability of making a Type I error) is $\alpha = 1 - 0.95 = 0.05$.
$p \leq 0.05$
In [25]:
# Single Mann-Whitney U run at n = 450 (< 5% of N), with box plot and
# sample descriptions enabled.
n = 450
mwu = MannWhitneyU(n)
U, p = mwu.sample_and_test(True, True)
r, d = mwu.effect_sizes(U)
# print() calls keep this runnable on Python 3 (originally Python 2 print statements);
# np.mean() on the scalar p/r/d values is a no-op retained from the original.
print("\nMann-Whitney U test results:")
print("n = {0}".format(n))
print("U = {0}".format(U))
print("p = {0:.2f}".format(np.mean(p)))
print("rank-biserial r value: {0:.2f}".format(np.mean(r)))
print("Cohen's d value: {0:.2f}".format(np.mean(d)))
[ N.B. The following values will change each time this notebook is run; however, the final results should not differ. ]
$p = 0.34$
Sample 1 (rainy days)
$\bar{x}_{1} = 1805.17$, $M_{1} = 922.5$
Sample 2 (non-rainy days)
$\bar{x}_{2} = 1733.25$, $M_{2} = 921.0$
The difference between the number of entries on rainy and non-rainy days ($n_{1} = n_{2} = 450$) from the data set is not statistically significant based on a two-independent-sample Mann-Whitney $U$ test using scipy.stats.mannwhitneyu ($U$ is extremely high [closer to its max value, $U = 101250$, than its min value] and $p$ is significantly greater than the proposed critical value $0.05$). In addition, Wendt's rank-biserial correlation $r$ and Cohen's $d$ both indicate an essentially non-existent effect size.
Thus, there does not appear to be either a statistical or practical difference between rainy days and non-rainy days.