In [510]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Location of the turnstile/weather CSV.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run anywhere else.
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'
# Load the CSV into a DataFrame; the later cells read their columns from `data`.
data = pd.read_csv(filename)
In [511]:
# Report the (rows, columns) shape of the dataset, then preview its first
# three rows as the cell output.
# The single-argument call form of print behaves the same on Python 2 and 3.
print("SHAPE: " + str(data.shape))
data.head(3)
Out[511]:
In [512]:
# Print a heading, then show the dtype of every column as the cell output.
# The single-argument call form of print behaves the same on Python 2 and 3.
print("COLUMNAR DATA TYPES")
data.dtypes
Out[512]:
In [513]:
# Summary statistics (count, mean, std, quartiles, extremes) for ENTRIESn.
data.loc[:, 'ENTRIESn'].describe()
Out[513]:
In [514]:
# Summary statistics for ENTRIESn_hourly, the column every later cell aggregates.
data.loc[:, 'ENTRIESn_hourly'].describe()
Out[514]:
In [538]:
### Columns to explore (each one is totalled against ENTRIESn_hourly in the cells below)
#units = data['UNIT'].unique()
#dates = data['DATEn'].unique()
#hours = data['hour'].unique()
#days_of_week = data['day_week'].unique()
#stations = data['station'].unique()
#latitudes = data['latitude'].unique()
#longitudes = data['longitude'].unique()
In [516]:
# Per-row unit labels and the hourly entries recorded on the same rows.
unit_instances = data['UNIT'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct unit.
unit_entries_hourly = {}
for unit_key, row_entries in zip(unit_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a unit is seen.
    unit_entries_hourly[unit_key] = unit_entries_hourly.get(unit_key, 0.0) + float(row_entries)

#print unit_entries_hourly
# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp cannot consume. On Python 2 this is a plain copy.
units = list(unit_entries_hourly.keys())
entries = list(unit_entries_hourly.values())
In [517]:
# One row per unit: its label and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
unit_df = pd.DataFrame(data=list(unit_entries_hourly.items()), columns=['unit','entries'])

print("UNITS AND THEIR ENTRIES")
print(unit_df.head(3))
print("")
print(unit_df.describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<14}".format(str(np.ptp(unit_df['entries'].values))))
In [518]:
# Line plot of the numeric columns of unit_df against the row index.
unit_axes = unit_df.plot(title='UNITS AND THEIR ENTRIES')
unit_axes.set_xlabel('unit row index')
unit_axes.set_ylabel('unit entries')
plt.show()
In [519]:
# Per-row dates and the hourly entries recorded on the same rows.
date_instances = data['DATEn'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct date.
date_entries_hourly = {}
for date_key, row_entries in zip(date_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a date is seen.
    date_entries_hourly[date_key] = date_entries_hourly.get(date_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp cannot consume. On Python 2 this is a plain copy.
dates = list(date_entries_hourly.keys())
entries = list(date_entries_hourly.values())
In [520]:
# One row per date: the date and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
date_df = pd.DataFrame(data=list(date_entries_hourly.items()), columns=['date','entries'])

print("DATES AND THEIR ENTRIES")
print(date_df.head(3))
print("")
print(date_df.describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<14}".format(str(np.ptp(date_df['entries'].values))))
In [521]:
# Line plot of the numeric columns of date_df against the row index.
date_axes = date_df.plot(title='DATES AND THEIR ENTRIES')
date_axes.set_xlabel('date row index')
date_axes.set_ylabel('date entries')
plt.show()
In [522]:
# Per-row hours and the hourly entries recorded on the same rows.
hour_instances = data['hour'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct hour.
hour_entries_hourly = {}
for hour_key, row_entries in zip(hour_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time an hour is seen.
    hour_entries_hourly[hour_key] = hour_entries_hourly.get(hour_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp / np.min cannot consume. On Python 2 this is a plain copy.
hours = list(hour_entries_hourly.keys())
entries = list(hour_entries_hourly.values())
In [523]:
# One row per hour: the hour and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
hour_df = pd.DataFrame(data=list(hour_entries_hourly.items()), columns=['hour','entries'])

print("HOURS AND THEIR ENTRIES")
print(hour_df.head(3))
print("")
# Describe only the totals; statistics of the hour values themselves are not shown.
print(hour_df[['entries']].describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<15}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<15}".format(str(np.ptp(hour_df['entries'].values))))
In [524]:
# A line plot joins the points in row order, and hour_df's rows follow dict
# iteration order, which is not guaranteed to be ascending. Order them by hour
# first so the line always runs left to right.
hour_df_by_hour = hour_df.iloc[np.argsort(hour_df['hour'].values)]

hour_axes = hour_df_by_hour.plot(x='hour', y='entries', title='HOURS AND THEIR ENTRIES')
hour_axes.set_xlabel('hour')
hour_axes.set_ylabel('hour entries')
hour_axes.legend(['entries'])
# Clamp the y-axis to the plotted totals. The limits come from the frame
# itself rather than the shared `entries` variable, which every reduce cell
# reassigns and which may belong to another column if cells ran out of order.
hour_axes.set_ylim(hour_df['entries'].min(), hour_df['entries'].max())
plt.show()
In [560]:
# Per-row day-of-week values and the hourly entries recorded on the same rows.
days_of_week_instances = data['day_week'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct day of the week.
days_of_week_entries_hourly = {}
for day_key, row_entries in zip(days_of_week_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a day is seen.
    days_of_week_entries_hourly[day_key] = days_of_week_entries_hourly.get(day_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp / np.min cannot consume. On Python 2 this is a plain copy.
days_of_week = list(days_of_week_entries_hourly.keys())
entries = list(days_of_week_entries_hourly.values())
In [561]:
# One row per day of the week: the day value and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
days_of_week_df = pd.DataFrame(data=list(days_of_week_entries_hourly.items()), columns=['day_week','entries'])

print("DAYS OF WEEK AND THEIR ENTRIES")
print(days_of_week_df.head(3))
print("")
# Describe only the totals; statistics of the day values themselves are not shown.
print(days_of_week_df[['entries']].describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<8}".format('range') + "{:0<14}".format(str(np.ptp(days_of_week_df['entries'].values))))
In [563]:
# A line plot joins the points in row order, and days_of_week_df's rows follow
# dict iteration order, which is not guaranteed to be ascending. Order them by
# day first so the line always runs left to right.
days_of_week_df_by_day = days_of_week_df.iloc[np.argsort(days_of_week_df['day_week'].values)]

day_axes = days_of_week_df_by_day.plot(x='day_week', y='entries', title='DAYS OF WEEK AND THEIR ENTRIES')
day_axes.set_xlabel('day of week')
day_axes.set_ylabel('day of week entries')
day_axes.legend(['entries'])
# Clamp the y-axis to the plotted totals. The limits come from the frame
# itself rather than the shared `entries` variable, which every reduce cell
# reassigns and which may belong to another column if cells ran out of order.
day_axes.set_ylim(days_of_week_df['entries'].min(), days_of_week_df['entries'].max())
plt.show()
In [527]:
# Per-row station names and the hourly entries recorded on the same rows.
station_instances = data['station'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct station.
station_entries_hourly = {}
for station_key, row_entries in zip(station_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a station is seen.
    station_entries_hourly[station_key] = station_entries_hourly.get(station_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp cannot consume. On Python 2 this is a plain copy.
stations = list(station_entries_hourly.keys())
entries = list(station_entries_hourly.values())
In [528]:
# One row per station: its name and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
station_df = pd.DataFrame(data=list(station_entries_hourly.items()), columns=['station','entries'])

print("STATIONS AND THEIR ENTRIES")
print(station_df.head(3))
print("")
print(station_df.describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<14}".format(str(np.ptp(station_df['entries'].values))))
In [558]:
# Line plot of the numeric columns of station_df against the row index.
station_axes = station_df.plot(title='STATIONS AND THEIR ENTRIES')
station_axes.set_xlabel('station row index')
station_axes.set_ylabel('station entries')
plt.show()
In [552]:
# Per-row latitudes and the hourly entries recorded on the same rows.
latitude_instances = data['latitude'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct latitude value.
latitude_entries_hourly = {}
for latitude_key, row_entries in zip(latitude_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a latitude is seen.
    latitude_entries_hourly[latitude_key] = latitude_entries_hourly.get(latitude_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp cannot consume. On Python 2 this is a plain copy.
latitudes = list(latitude_entries_hourly.keys())
entries = list(latitude_entries_hourly.values())
In [553]:
# One row per distinct latitude: the latitude and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
latitude_df = pd.DataFrame(data=list(latitude_entries_hourly.items()), columns=['latitude','entries'])

print("LATITUDES AND THEIR ENTRIES")
print(latitude_df.head(3))
print("")
# Describe the two columns separately, one summary table each.
print(latitude_df[['latitude']].describe())
print(latitude_df[['entries']].describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<14}".format(str(np.ptp(latitude_df['entries'].values))))
In [556]:
# Scatter of total entries against latitude, one point per row of latitude_df.
latitude_axes = latitude_df.plot(
    x='latitude',
    y='entries',
    title='LATITUDES AND THEIR ENTRIES',
    kind='scatter',
)
latitude_axes.set_xlabel('latitude')
latitude_axes.set_ylabel('latitude entries')
latitude_axes.legend(['entries'])
plt.show()
In [ ]:
In [564]:
# Per-row longitudes and the hourly entries recorded on the same rows.
longitude_instances = data['longitude'].values
entries_hourly_by_row = data['ENTRIESn_hourly'].values

# reduce: total the hourly entries for each distinct longitude value.
longitude_entries_hourly = {}
for longitude_key, row_entries in zip(longitude_instances, entries_hourly_by_row):
    # dict.get supplies 0.0 the first time a longitude is seen.
    longitude_entries_hourly[longitude_key] = longitude_entries_hourly.get(longitude_key, 0.0) + float(row_entries)

# Materialise as lists: on Python 3 keys()/values() are views, which numpy
# helpers such as np.ptp cannot consume. On Python 2 this is a plain copy.
longitudes = list(longitude_entries_hourly.keys())
entries = list(longitude_entries_hourly.values())
In [ ]:
def _column_values(column):
    """Return the values for `column`: looked up in `data` if it is a label, else as an array."""
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    if isinstance(column, (str, type(u''))):
        return data[column].values
    return np.asarray(column)


def get_column_values_and_reduce(column1, column2):
    """Total the values of one column for each distinct value of another.

    Performs the same "reduce" step this notebook repeats by hand for each
    explored column: walk two parallel columns and sum the second per
    distinct value of the first.

    Parameters
    ----------
    column1 : str or array-like
        Grouping column. A text label is looked up in the module-level
        `data` frame; anything else is used directly as the per-row values.
        NOTE(review): the original stub never said which form it expected;
        both are accepted here.
    column2 : str or array-like
        Column whose values are summed, given in the same way as `column1`.
        Each value is converted with float() before being added.

    Returns
    -------
    keys : list
        The distinct values of `column1`.
    values : list of float
        The totals, where values[i] belongs to keys[i].

    Raises
    ------
    ValueError
        If the two columns do not have the same number of rows.
    """
    instances = _column_values(column1)
    amounts = _column_values(column2)
    if len(instances) != len(amounts):
        # zip() would silently drop the surplus rows; fail loudly instead.
        raise ValueError(
            "columns differ in length: %d vs %d" % (len(instances), len(amounts))
        )

    totals = {}
    for key, amount in zip(instances, amounts):
        totals[key] = totals.get(key, 0.0) + float(amount)

    # Build values from keys so the two lists are paired by position.
    keys = list(totals.keys())
    values = [totals[key] for key in keys]
    return keys, values
In [565]:
# One row per distinct longitude: the longitude and its total hourly entries.
# list(...) keeps the constructor working on Python 3, where items() is a view.
longitude_df = pd.DataFrame(data=list(longitude_entries_hourly.items()), columns=['longitude','entries'])

print("LONGITUDES AND THEIR ENTRIES")
print(longitude_df.head(3))
print("")
# Describe the two columns separately, one summary table each.
print(longitude_df[['longitude']].describe())
print(longitude_df[['entries']].describe())
# Range (max - min) of the totals, taken from the frame's own column so the
# line does not depend on the shared `entries` variable other cells reassign.
# NOTE: "{:0<14}" right-pads the text with zeros; that only reads as extra
# decimals while str() gives plain "digits.digits" notation.
print("{:<7}".format('range') + "{:0<14}".format(str(np.ptp(longitude_df['entries'].values))))
In [566]:
# Scatter of total entries against longitude, one point per row of longitude_df.
longitude_axes = longitude_df.plot(
    x='longitude',
    y='entries',
    title='LONGITUDES AND THEIR ENTRIES',
    kind='scatter',
)
longitude_axes.set_xlabel('longitude')
longitude_axes.set_ylabel('longitude entries')
longitude_axes.legend(['entries'])
plt.show()
In [530]:
# TODO: histogram of longitude and latitude
In [530]:
In [530]:
In [530]:
In [530]:
In [530]:
In [530]: