In [44]:
from load import load
from matplotlib import pyplot as plt
In [45]:
# Read the training set through the `load` helper imported from the `load`
# module. NOTE(review): the return type is not visible here; later cells call
# groupby/plot/corr on it, so it is presumably a pandas DataFrame -- confirm.
train = load('train.csv')
In [95]:
# Windspeed
# Average every column within each distinct 'windspeed' value, then draw the
# averaged 'count' column against windspeed with circle markers.
mean_by_windspeed = train.groupby('windspeed').mean()
mean_by_windspeed.plot(marker='o', y='count')
plt.show()
In [77]:
# Full data plots.
# Each listed column is divided by its own mean before being drawn against
# the 'dates' column, so the series are on a comparable vertical scale.
# NOTE(review): plt.show() is placed after the loop so that all series land
# on one figure -- confirm this matches the intended layout.
plots = ['temp', 'atemp']
for plot in plots:
    normalized = train[plot] / train[plot].mean()
    plt.plot_date(train['dates'], normalized)
plt.show()
In [83]:
# Input dependencies.
# Scatter the 'humidity' column (y axis) against the 'temp' column (x axis)
# to see how the two inputs relate to each other.
train.plot(kind='scatter', x='temp', y='humidity')
plt.show()
Analyze temperature dependence: see how count varies with temperature while holding all other variables constant.
In [46]:
#train[['temp', 'count']]
In [103]:
# Choose a dependency to analyze from the following subset of columns
# in the dataset: ('temp', 'humidity', 'weather').
dependency = 'humidity'
# The scatter plot is drawn only when strictly more than this many rows
# survive all of the cuts below.
min_stats = 3

# Limit the dependency/correlation analysis to different
# slices of the function space by adjusting the following ranges.
# Any *_min / *_max bound may be set to None to leave that side open.
weather_min = 1
weather_max = 1
humidity_min = 10.0
humidity_max = 100.0
temp_min = 25.0
temp_max = 35.0
hour_min = 10
hour_max = 15
workingday = 0
holiday = 0


def _select_range(frame, column, lower, upper):
    """Return the rows of `frame` whose `column` lies in [lower, upper].

    Both bounds are inclusive. A bound of None leaves that side unbounded.
    """
    if lower is not None:
        frame = frame[frame[column] >= lower]
    if upper is not None:
        frame = frame[frame[column] <= upper]
    return frame


def _describe_range(label, number_format, lower, upper):
    """Build a title fragment such as 'Temp range (C): 25.0-35.0'.

    `number_format` is a printf-style format for a single bound. An open
    bound (None) is shown as 'any' rather than being fed to the numeric
    format, which would raise TypeError.
    """
    shown = ['any' if bound is None else number_format % bound
             for bound in (lower, upper)]
    return '%s: %s-%s' % (label, shown[0], shown[1])


# Always restart from the full training frame so re-running the cell gives
# the same result regardless of earlier runs.
data = train
data = data[data['workingday'] == workingday]
data = data[data['holiday'] == holiday]

# First title fragment names the kind of day selected above.
if holiday == 1:
    title = ['Holidays']
elif workingday == 1:
    title = ['Workdays']
else:
    title = ['Weekends']

# The hour cut is applied unconditionally.
title.append(_describe_range('Hours', '%d', hour_min, hour_max))
data = _select_range(data, 'hour', hour_min, hour_max)

# The remaining cuts are applied to every column except the one under study,
# so that column keeps its full range on the x axis.
range_cuts = [
    ('weather', 'Weather range', '%d', weather_min, weather_max),
    ('temp', 'Temp range (C)', '%.1f', temp_min, temp_max),
    ('humidity', 'Humidity range (%)', '%.1f', humidity_min, humidity_max),
]
for column, label, number_format, lower, upper in range_cuts:
    if column == dependency:
        continue
    title.append(_describe_range(label, number_format, lower, upper))
    data = _select_range(data, column, lower, upper)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Data points: %d' % len(data))
if len(data) > min_stats:
    data.plot(x=dependency, y='count', kind='scatter',
              title=', '.join(title), ax=plt.gca())
else:
    print('Not enough statistics!')
plt.show()
In [99]:
# Display the `columns` attribute of `train` as this cell's output.
train.columns
Out[99]:
In [12]:
# Pairwise correlation table restricted to the columns listed below; the
# trailing expression is what the cell displays.
corr_columns = ['temp', 'count', 'weather', 'windspeed', 'dayage']
train[corr_columns].corr()
Out[12]:
In [ ]: