In [1460]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Location of the turnstile/weather CSV. The TURNSTILE_WEATHER_CSV environment
# variable, when set, overrides the original author's local path so the notebook
# can be re-run on another machine without editing this cell.
import os
filename = os.environ.get(
    'TURNSTILE_WEATHER_CSV',
    '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv')
# import data
data = pd.read_csv(filename)
In [1461]:
# Report the data set's (rows, columns) shape, then preview its first three rows.
print("SHAPE: {0}".format(data.shape))
data.head(n=3)
Out[1461]:
In [1462]:
# List the dtype pandas assigned to every column of the loaded frame.
print("COLUMNAR DATA TYPES")
data.dtypes
Out[1462]:
In [1463]:
# Summary statistics of the ENTRIESn_hourly column.
data.loc[:, 'ENTRIESn_hourly'].describe()
Out[1463]:
In [1464]:
# Per-row ENTRIESn_hourly values as a plain array; map_column_to_entries_hourly
# reads this notebook-level name and indexes it by row position.
entries_hourly_by_row = data.loc[:, 'ENTRIESn_hourly'].values
In [1465]:
def map_column_to_entries_hourly(column, entries_by_row=None):
    """Sum hourly entries for each distinct value found in `column`.

    Rows are matched by position: the i-th value of `column` is credited with
    the i-th element of the entries sequence.

    Parameters
    ----------
    column : pandas.Series
        Column whose distinct values become the dictionary keys (any object
        exposing `.values` works), e.g. data['longitude'].
    entries_by_row : sequence of numbers, optional
        Per-row entry counts aligned with `column`. When omitted, the
        notebook-level `entries_hourly_by_row` array is used, which is what
        the function always did before this parameter existed.

    Returns
    -------
    dict
        Maps each distinct column value to its total entries, as a float.

    Notes
    -----
    If `column` has more rows than the entries sequence, indexing past its end
    fails (IndexError for lists and numpy arrays). A shorter `column` simply
    ignores the surplus entries.
    """
    if entries_by_row is None:
        # Fall back to the notebook-level array so existing one-argument calls keep working.
        entries_by_row = entries_hourly_by_row

    totals = {}
    for row, key in enumerate(column.values):
        totals[key] = totals.get(key, 0.0) + float(entries_by_row[row])
    return totals
In [1466]:
def display_basic_stats(entries_hourly_dict, column1name):
    """Tabulate a {value: total entries} mapping, print a short report, return the table.

    Parameters
    ----------
    entries_hourly_dict : dict
        Maps a column value to its summed hourly entries.
    column1name : str
        Name given to the first column of the table; its upper-cased plural
        (produced by inflect) heads the printed report.

    Returns
    -------
    pandas.DataFrame
        Columns `column1name` and 'entries', one row per dictionary item.

    The report consists of the heading, the first three rows, `describe()` of
    the 'entries' column, and a 'range' line holding the peak-to-peak spread of
    the dictionary values (left-aligned and right-padded with '0' to 14 characters).
    """
    pairs = entries_hourly_dict.items()
    table = pd.DataFrame(data=pairs, columns=[column1name, 'entries'])

    heading = inflect.engine().plural(column1name.upper())
    print("{0} AND THEIR ENTRIES".format(heading))
    print(table.head(3))
    print("")
    print(pd.DataFrame(table['entries']).describe())

    spread = str(np.ptp(entries_hourly_dict.values()))
    print("{:<7}".format('range') + "{:0<14}".format(spread))
    return table
In [1467]:
def plot_data(df, column1name, plot_kind, xaxis_labeled):
    """Plot a table built by display_basic_stats and show the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a `column1name` column and an 'entries' column.
    column1name : str
        Name of the value column; also used in the title and axis labels.
    plot_kind : str
        Passed through as the `kind` argument of `DataFrame.plot`.
    xaxis_labeled : bool
        When equal to True, 'entries' is plotted against `column1name`;
        otherwise the frame is plotted as-is and the x axis is labelled as a
        row index.
    """
    heading = "{0} AND THEIR ENTRIES".format(inflect.engine().plural(column1name.upper()))
    plot_args = dict(title=heading, kind=plot_kind, alpha=0.5)

    # Deliberately an equality test rather than bare truthiness, so only
    # True (or a value equal to it) selects the labelled branch.
    if xaxis_labeled == True:
        plot_args.update(x=column1name, y='entries')
        x_caption = column1name
    else:
        x_caption = "{0} row index".format(column1name)

    df.plot(**plot_args)
    plt.xlabel(x_caption)
    plt.ylabel('{0} entries'.format(column1name))
    plt.legend(['entries'])
    plt.show()
In [1468]:
def plot_histogram(df, column_name, num_of_bins):
    """Draw a half-transparent green histogram of one column and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    column_name : str
        Column whose values are binned.
    num_of_bins : int
        Number of histogram bins, forwarded as `bins`.
    """
    hist_style = {'kind': 'hist', 'bins': num_of_bins, 'alpha': 0.5, 'color': 'green'}
    df[column_name].plot(**hist_style)
    plt.ylabel('frequency')
    plt.show()
In [1469]:
# Total hourly entries per UNIT: aggregate, print summary stats, then plot against row index.
unit_entries_hourly = map_column_to_entries_hourly(column=data['UNIT'])
unit_df = display_basic_stats(entries_hourly_dict=unit_entries_hourly, column1name='unit')
plot_data(df=unit_df, column1name='unit', plot_kind='line', xaxis_labeled=False)
In [1470]:
# Number of rows recorded per unit (first five units), followed by summary
# statistics of the UNIT column itself.
print(data.groupby('UNIT')['DATEn'].count().head(5))
print("")
print(data['UNIT'].describe())
[At first, there is no clear reason why UNIT row-counts differ from one another. However, given the further examination below, it seems clear that row-counts are related to (if not determined by) the number of entries each unit receives, as evidenced by the most frequently occurring unit in the data set (R084) receiving the highest number of hourly entries.]
In [1594]:
# Number of rows each unit contributes to the data set.
units = data['UNIT'].value_counts()
# NOTE(review): 186 is presumably the largest per-unit row count — confirm against the describe() output.
print(units[units == 186])
# Attach every unit's row count with one vectorised lookup instead of scanning
# unit_df once per unit. Cast to float so the column keeps the dtype that
# filling a new column row by row is expected to produce.
unit_df['frequency'] = unit_df['unit'].map(units).astype(float)
# Most frequent units first; ties are broken by unit name, also descending.
unit_df.sort(columns=['frequency', 'unit'], ascending=False, inplace=True)
print(unit_df[unit_df['frequency'] == 186].shape)
In [1472]:
# Total hourly entries per DATEn value: aggregate, print summary stats, then plot against row index.
date_entries_hourly = map_column_to_entries_hourly(column=data['DATEn'])
date_df = display_basic_stats(entries_hourly_dict=date_entries_hourly, column1name='date')
plot_data(df=date_df, column1name='date', plot_kind='line', xaxis_labeled=False)
In [1473]:
# Total hourly entries per value of the hour column, plotted with hour on the x axis.
hour_entries_hourly = map_column_to_entries_hourly(column=data['hour'])
hour_df = display_basic_stats(entries_hourly_dict=hour_entries_hourly, column1name='hour')
plot_data(df=hour_df, column1name='hour', plot_kind='line', xaxis_labeled=True)
In [1474]:
# Total hourly entries per value of the day_week column, plotted with that value on the x axis.
weekday_entries_hourly = map_column_to_entries_hourly(column=data['day_week'])
weekday_df = display_basic_stats(entries_hourly_dict=weekday_entries_hourly, column1name='weekday')
plot_data(df=weekday_df, column1name='weekday', plot_kind='line', xaxis_labeled=True)
In [1475]:
# Total hourly entries per station: aggregate, print summary stats, then plot against row index.
station_entries_hourly = map_column_to_entries_hourly(column=data['station'])
station_df = display_basic_stats(entries_hourly_dict=station_entries_hourly, column1name='station')
plot_data(df=station_df, column1name='station', plot_kind='line', xaxis_labeled=False)
In [1476]:
# Total hourly entries per distinct latitude: scatter of entries against latitude,
# then a 15-bin histogram of the distinct latitude values.
latitude_entries_hourly = map_column_to_entries_hourly(column=data['latitude'])
latitude_df = display_basic_stats(entries_hourly_dict=latitude_entries_hourly, column1name='latitude')
plot_data(df=latitude_df, column1name='latitude', plot_kind='scatter', xaxis_labeled=True)
plot_histogram(df=latitude_df, column_name='latitude', num_of_bins=15)
In [1477]:
# Total hourly entries per distinct longitude: scatter of entries against longitude,
# then a 10-bin histogram of the distinct longitude values.
longitude_entries_hourly = map_column_to_entries_hourly(column=data['longitude'])
longitude_df = display_basic_stats(entries_hourly_dict=longitude_entries_hourly, column1name='longitude')
plot_data(df=longitude_df, column1name='longitude', plot_kind='scatter', xaxis_labeled=True)
plot_histogram(df=longitude_df, column_name='longitude', num_of_bins=10)
In [1478]:
# Total hourly entries per value of the rain flag, shown as a bar chart.
rain_entries_hourly = map_column_to_entries_hourly(column=data['rain'])
rain_df = display_basic_stats(entries_hourly_dict=rain_entries_hourly, column1name='rain')
plot_data(df=rain_df, column1name='rain', plot_kind='bar', xaxis_labeled=True)
In [1479]:
# rain == 1 marks a row with rain and rain == 0 a row without it; this is the
# convention the rain-date selection further down this notebook uses. The two
# filters were previously swapped, which mislabelled both summaries.
rain_days = data[data['rain'] == 1]
no_rain_days = data[data['rain'] == 0]
print("RAIN DAYS")
print(rain_days['ENTRIESn_hourly'].describe())
print("")
print("NO-RAIN DAYS")
print(no_rain_days['ENTRIESn_hourly'].describe())
In [1480]:
# Overlay the ENTRIESn_hourly distributions of the two subsets. The rain subset
# is drawn first so that it lines up with the first legend entry.
for _frame, _bins, _colour in [(rain_days, 20, 'blue'), (no_rain_days, 15, 'yellow')]:
    _frame['ENTRIESn_hourly'].plot(kind='hist', bins=_bins, alpha=0.5, color=_colour)
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.legend(['rain', 'no rain'])
plt.show()
Rainy/Non-rainy days are technically not random, although they may be considered random for most non-meteorological purposes. Moreover, rainy/non-rainy days tend to cluster (for meteorological reasons). In the current data set, certain days are labeled as both 'rain' and 'no rain', which presumably means that rain occurred in certain locations while it did not in others on the same day. [Thankfully, the current data set is not as double-minded at the station level: it reports (only) either 'rain' or 'no-rain' at individual station locations in a single day.]
In [1481]:
# One row per distinct (date, rain flag) pair, ordered by date.
date_and_rain = data[['DATEn', 'rain']].drop_duplicates()
date_and_rain.sort(columns='DATEn', inplace=True)
print(date_and_rain.head())

# Distinct dates overall, and the distinct dates carrying each rain flag.
dates = data['DATEn'].unique()
rain_dates = date_and_rain.loc[date_and_rain['rain'] == 1, 'DATEn'].unique()
no_rain_dates = date_and_rain.loc[date_and_rain['rain'] == 0, 'DATEn'].unique()

# Position of each date inside `dates`; used as that date's x coordinate.
indices_of_rain_dates = [np.where(dates == rain_date)[0][0] for rain_date in rain_dates]
indices_of_no_rain_dates = [np.where(dates == no_rain_date)[0][0] for no_rain_date in no_rain_dates]

# Rain dates sit at y = 1 and no-rain dates at y = 0; a date carrying both flags gets both markers.
plt.title('RAIN AND NO-RAIN DAYS')
plt.ylabel('rain')
plt.yticks([0,1])
plt.xticks(np.arange(len(dates)), dates, rotation='vertical')
plt.scatter(indices_of_rain_dates, np.ones(len(indices_of_rain_dates)), color='blue')
plt.scatter(indices_of_no_rain_dates, np.zeros(len(indices_of_no_rain_dates)), color='yellow', edgecolors='black')
plt.legend(['rain', 'no rain'], bbox_to_anchor=(1.05, 1), loc=2)
plt.show()
In [1481]:
While days-with-rain occur in greater number in this data set (thus, contributing to any possible higher-frequency counts), the distribution of ENTRIESn_hourly for rain and no-rain days seems comparable according to the above histogram.
In contrast, the non-weather-related data
In [1482]:
# perform statistical tests on rain/no-rain days to compare means and stds