In [44]:
from load import load
from matplotlib import pyplot as plt

In [45]:
# Read the training set with the project-local load() helper (from the
# `load` module); the cells below index, group and plot it like a pandas
# DataFrame.
train = load('train.csv')

In [95]:
# Windspeed: mean of 'count' at each distinct windspeed value.
# Only 'count' is aggregated. Averaging every column is wasted work, and
# pandas >= 2.0 raises instead of silently dropping a column it cannot
# average (e.g. a string-typed one -- column dtypes are not visible here).
mean_count_by_wind = train.groupby('windspeed')[['count']].mean()
ax = mean_count_by_wind.plot(y='count', marker='o')
ax.set_ylabel('mean count')
plt.show()

In [77]:
# Full data plots: each listed column over time, divided by its own mean
# so the overlaid series share a comparable, unitless scale.
plots = ['temp', 'atemp']

for plot in plots:
    # label= feeds the legend below; without it the overlaid series
    # cannot be told apart.
    plt.plot_date(train['dates'], train[plot] / train[plot].mean(), label=plot)

plt.legend()
plt.show()

In [83]:
# Input dependencies: scatter one input column (humidity, y-axis)
# against another (temp, x-axis) to see how the two relate.
train.plot(kind='scatter', x='temp', y='humidity')
plt.show()

Analyze temperature dependence: see how count varies with temperature while holding all other variables constant.


In [46]:
#train[['temp', 'count']]

In [103]:
# Choose a dependency to analyze from the following subset of columns
# in the dataset: ('temp', 'humidity', 'weather').
# The chosen column goes on the x-axis of the scatter plot and is the
# only one of the three that is NOT range-filtered below.
dependency = 'humidity'
# A plot is drawn only when more than min_stats rows survive the cuts.
min_stats = 3

# Limit the dependency/correlation analysis to different
# slices of the function space by adjusting the following ranges.
# Every *_min/*_max pair is inclusive; set a bound to None to leave that
# side of the range open.
weather_min = 1
weather_max = 1
humidity_min = 10.0
humidity_max = 100.0
temp_min = 25.0
temp_max = 35.0
hour_min = 10
hour_max = 15
# Exact-match selectors for the 'workingday' and 'holiday' columns.
workingday = 0
holiday = 0


def _bound_text(value, fmt):
    """Render one range bound for the title; None (open bound) becomes '*'."""
    if value is None:
        return '*'
    return fmt % value


def _range_title(label, fmt, low, high):
    """Build '<label>: <low>-<high>' with each bound rendered through fmt."""
    return '%s: %s-%s' % (label, _bound_text(low, fmt), _bound_text(high, fmt))


def _cut_range(frame, column, low, high):
    """Return the rows of frame with low <= frame[column] <= high.

    A bound that is None is not applied, i.e. that side stays open.
    """
    if low is not None:
        frame = frame[frame[column] >= low]
    if high is not None:
        frame = frame[frame[column] <= high]
    return frame


data = train
data = data[data['workingday'] == workingday]
data = data[data['holiday'] == holiday]

# The first title fragment names the selected day type; holiday takes
# precedence over workingday.
if holiday == 1:
    title = ['Holidays']
elif workingday == 1:
    title = ['Workdays']
else:
    title = ['Weekends']

data = _cut_range(data, 'hour', hour_min, hour_max)
title.append(_range_title('Hours', '%d', hour_min, hour_max))

# Each of the three candidate columns is range-cut unless it is the
# dependency under study. Titles go through _range_title so that a None
# bound no longer crashes the '%.1f' / '%d' formatting.
if dependency != 'weather':
    title.append(_range_title('Weather range', '%d', weather_min, weather_max))
    data = _cut_range(data, 'weather', weather_min, weather_max)

if dependency != 'temp':
    title.append(_range_title('Temp range (C)', '%.1f', temp_min, temp_max))
    data = _cut_range(data, 'temp', temp_min, temp_max)

if dependency != 'humidity':
    title.append(_range_title('Humidity range (%)', '%.1f', humidity_min, humidity_max))
    data = _cut_range(data, 'humidity', humidity_min, humidity_max)

# Single-argument print(...) produces the same text on Python 2 and 3.
print('Data points: %d' % len(data))
if len(data) > min_stats:
    data.plot(x=dependency, y='count', kind='scatter', title=', '.join(title), ax=plt.gca())
else:
    print('Not enough statistics!')

plt.show()


Data points: 221

In [99]:
# Show the column labels available in train.
train.columns


Out[99]:
Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp', u'atemp', u'humidity', u'windspeed', u'casual', u'registered', u'count', u'dates', u'hour', u'day', u'month', u'monthday', u'dayage', u'hourage', u'weekage', u'age_seconds'], dtype='object')

In [12]:
# Pairwise correlation matrix for a hand-picked subset of columns.
corr_columns = ['temp', 'count', 'weather', 'windspeed', 'dayage']
train[corr_columns].corr()


Out[12]:
temp count weather windspeed dayage
temp 1.000000 0.394454 -0.055035 -0.017852 0.180785
count 0.394454 1.000000 -0.128655 0.101369 0.309636
weather -0.055035 -0.128655 1.000000 0.007261 -0.005017
windspeed -0.017852 0.101369 0.007261 1.000000 -0.087088
dayage 0.180785 0.309636 -0.005017 -0.087088 1.000000

In [ ]: