In [1943]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [1944]:
print "SHAPE: " + str(data.shape)
data.head(3)
Out[1944]:
In [1945]:
print "COLUMNAR DATA TYPES"
data.dtypes
Out[1945]:
In [1946]:
data['ENTRIESn_hourly'].describe()
Out[1946]:
[ N.B. Due to decisions described the Unit_Entries supplement, in the current analysis, unless otherwise noted, entries will refer to a summation of ENTRIESn_hourly per UNIT (i.e., not, as might be expected, values in the ENTRIESn column). ]
In [1947]:
entries_hourly_by_row = data['ENTRIESn_hourly'].values
In [1948]:
def map_column_to_entries_hourly(column):
instances = column.values # e.g., longitude_instances = data['longitude'].values
# reduce
entries_hourly = {} # e.g., longitude_entries_hourly = {}
for i in np.arange(len(instances)):
if instances[i] in entries_hourly:
entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
else:
entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
return entries_hourly # e.g., longitudes, entries
In [1949]:
def display_basic_stats(entries_hourly_dict, column1name):
# e.g, longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
p = inflect.engine()
print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
print df.head(3)
print
print pd.DataFrame(df['entries']).describe()
print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
return df # e.g, longitude_df
In [1950]:
def plot_data(df, column1name, plot_kind, xaxis_labeled):
p = inflect.engine()
if xaxis_labeled == True:
df.plot(x=column1name, y='entries', title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
plt.xlabel(column1name)
else:
df.plot(title="{0} AND THEIR ENTRIES".format(p.plural(column1name.upper())), kind=plot_kind, alpha=0.5, color='green')
plt.xlabel("{0} row index".format(column1name))
plt.ylabel('{0} entries'.format(column1name))
plt.legend(['entries'])
plt.show()
In [1951]:
def plot_histogram(df, column_name, num_of_bins):
df[column_name].plot(kind='hist', bins=num_of_bins, alpha=0.5, color='green')
plt.ylabel('frequency')
plt.show()
In [1952]:
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
plot_data(unit_df, 'unit', 'line', False)
UNIT SUMMARY
Clearly, certain units received more entries than other units.
In [1956]:
date_entries_hourly = map_column_to_entries_hourly(data['DATEn'])
date_df = display_basic_stats(date_entries_hourly, 'date')
plot_data(date_df, 'date', 'line', False)
DATE SUMMARY
Clearly, certain dates received more entries than other dates.
In [1957]:
hour_entries_hourly = map_column_to_entries_hourly(data['hour'])
hour_df = display_basic_stats(hour_entries_hourly, 'hour')
plot_data(hour_df, 'hour', 'line', True)
HOUR SUMMARY
Clearly, certain hours received more entries than other hours.
In [1958]:
weekday_entries_hourly = map_column_to_entries_hourly(data['day_week'])
weekday_df = display_basic_stats(weekday_entries_hourly, 'weekday')
plot_data(weekday_df, 'weekday', 'line', True)
WEEKDAY SUMMARY
Clearly, certain weekdays received more entries than other weekdays.
In [1959]:
station_entries_hourly = map_column_to_entries_hourly(data['station'])
station_df = display_basic_stats(station_entries_hourly, 'station')
plot_data(station_df, 'station', 'line', False)
STATION SUMMARY
Clearly, certain stations received more entries than other stations.
In [1960]:
latitude_entries_hourly = map_column_to_entries_hourly(data['latitude'])
latitude_df = display_basic_stats(latitude_entries_hourly, 'latitude')
plot_data(latitude_df, 'latitude', 'scatter', True)
plot_histogram(latitude_df, 'latitude', 15)
In [1961]:
longitude_entries_hourly = map_column_to_entries_hourly(data['longitude'])
longitude_df = display_basic_stats(longitude_entries_hourly, 'longitude')
plot_data(longitude_df, 'longitude', 'scatter', True)
plot_histogram(longitude_df, 'longitude', 10)
In [1962]:
rain_entries_hourly = map_column_to_entries_hourly(data['rain'])
rain_df = display_basic_stats(rain_entries_hourly, 'rain')
plot_data(rain_df, 'rain', 'bar', True)
In [1963]:
rain_days = data[data['rain'] == 0]
no_rain_days = data[data['rain'] == 1]
print "RAIN DAYS"
print rain_days['ENTRIESn_hourly'].describe()
print
print "NO-RAIN DAYS"
print no_rain_days['ENTRIESn_hourly'].describe()
In [1964]:
rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=20, alpha=0.5, color='blue')
no_rain_days['ENTRIESn_hourly'].plot(kind='hist', bins=15, alpha=0.5, color='yellow')
plt.title('ENTRIESn_hourly HISTOGRAM (by RAIN)')
plt.xlabel('ENTRIESn_hourly')
plt.ylabel('frequency')
plt.legend(['rain', 'no rain'])
plt.show()
RAIN SUMMARY
While days-with-rain occur in greater number in this data set (thus, contributing to higher-frequency counts), the distribution of ENTRIESn_hourly for rain and no-rain days seems comparable according to the above histogram.
In [1968]:
# perform statistical tests on rain/no-rain days to compare means and stds