In [4]:
import inflect # for string manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# import data
data = pd.read_csv(filename)
In [5]:
entries_hourly_by_row = data['ENTRIESn_hourly'].values
def map_column_to_entries_hourly(column):
instances = column.values # e.g., longitude_instances = data['longitude'].values
# reduce
entries_hourly = {} # e.g., longitude_entries_hourly = {}
for i in np.arange(len(instances)):
if instances[i] in entries_hourly:
entries_hourly[instances[i]] += float(entries_hourly_by_row[i])
else:
entries_hourly[instances[i]] = float(entries_hourly_by_row[i])
return entries_hourly # e.g., longitudes, entries
def display_basic_stats(entries_hourly_dict, column1name):
# e.g, longitude_df = pd.DataFrame(data=longitude_entries_hourly.items(), columns=['longitude','entries'])
df = pd.DataFrame(data=entries_hourly_dict.items(), columns=[column1name,'entries'])
p = inflect.engine()
print "{0} AND THEIR ENTRIES".format(p.plural(column1name.upper()))
print df.head(3)
print
print pd.DataFrame(df['entries']).describe()
print "{:<7}".format('range') + "{:0<14}".format(str(np.ptp(entries_hourly_dict.values())))
return df # e.g, longitude_df
In [6]:
unit_entries_hourly = map_column_to_entries_hourly(data['UNIT'])
unit_df = display_basic_stats(unit_entries_hourly, 'unit')
In [2]:
print data.groupby('UNIT')['DATEn'].count().head()
print
print data['UNIT'].describe()
At first, there is no clear reason why UNIT row-counts differ from one another. However, after further examination below, it seems clear that UNIT row-counts are related to (although not exactly determined by) the two entries columns in the data set, as evidenced, for example, by one of the most frequently occuring units in the data set (R084) receiving the highest number of ENTRIESn_hourly and the least frequently occuring unit in the data set (R459) having one of the lowest ENTRIESn_hourly (although, not the lowest).
In [8]:
units = data['UNIT'].value_counts()
most_freq_units = units[units == units.max()]
most_freq_units = list(most_freq_units.index.values) # 52 of these occur in the data set 186 times
most_freq_units.sort()
most_hourly_entry_units = unit_df.sort(columns='entries', ascending=False).head(52)
most_hourly_entry_units = list(most_hourly_entry_units['unit'])
most_hourly_entry_units.sort()
units_diff = []
for unit in most_freq_units:
if unit not in most_hourly_entry_units:
units_diff.append(unit)
print "In ascending order, the first five of the most frequently occurring \nunit-rows in the data set:\n" + str(most_freq_units[0:5])
print "\nBy contrast, in ascending order, the first five units with the highest \nnumber of combined entries hourly:\n" + str(most_hourly_entry_units[0:5])
print "\nTotal number of differences between sets:\n" + str(len(units_diff)) + " (out of 52)"
Clearly, the number of rows a UNIT receives in the data set and the number of ENTRIESn_hourly are not exactly aligned. As a matter of thoroughness, checking the ENTRIESn column reveals similar discrepencies.
In [9]:
entries_df = data[['UNIT', 'ENTRIESn']]
unit_entries = {}
for unit in entries_df['UNIT']:
if unit not in unit_entries:
unit_entries[unit] = entries_df[entries_df['UNIT'] == unit]['ENTRIESn'].sum()
entries_df = pd.DataFrame(data=unit_entries.items(), columns=['unit', 'entries'])
entries_df.sort(columns='entries', ascending=False, inplace=True)
highest_entries_df = entries_df.head(52)
highest_entry_units = list(highest_entries_df['unit'])
highest_entry_units.sort()
print "\nIn ascending order, the first five of the units with the highest number\nof combined entries:\n" + str(highest_entry_units[0:5])
units_diff = []
for unit in most_freq_units:
if unit not in highest_entry_units:
units_diff.append(unit)
print "\nTotal number of differences between the 52 most-frequently-occurring\nunits in the data set and the 52 units with the highest number of entries:\n" + str(len(units_diff)) + " (out of 52)"
units_diff = []
for unit in most_hourly_entry_units:
if unit not in highest_entry_units:
units_diff.append(unit)
print "\nTotal number of differences between the 52 units with the most hourly-entries and the 52 units with the highest number of entries:\n" + str(len(units_diff)) + " (out of 52)"
In summary, while there is clearly some relationship between UNIT row-counts and the two entries columns, it is unclear what that exact relationship is, or how it might prove to be useful in the current analysis. Moreover, since only a single numeric value is needed to provide the relative importance/frequency-of-use of each UNIT, only one value was used throughout the remainder of this study.
Thus, since counting rows of UNIT as a relevant metric seemed less precise than using the sum of a given unit's ENTRIESn_hourly, the defining numeric characteristic of UNIT will be indicated by ENTRIESn_hourly. [ ENTRIESn failed to be chosen to due the, in comparison, rather odd and unintuitive nature of the values it offers. ]