notebook.community

Edit and run



In [44]:

    
from load import load
from matplotlib import pyplot as plt



In [45]:

    
# Read the training set through the project's `load` helper (imported above).
train = load("train.csv")



In [95]:

    
# Windspeed: mean of 'count' for each distinct windspeed value.
# Only the 'count' column is aggregated. Averaging the whole frame is wasted
# work, and pandas >= 2.0 raises on non-numeric columns instead of silently
# dropping them (NOTE(review): the frame has 'datetime'/'dates' columns whose
# dtypes are not visible here).
train.groupby('windspeed')[['count']].mean().plot(
    y='count', marker='o', title='Mean count by windspeed')
plt.show()



In [77]:

    
# Full data plots: each series is divided by its own mean before plotting,
# so both are drawn on a common relative scale.
plots = ['temp', 'atemp']

for column in plots:
    # NOTE(review): plot_date is deprecated in recent Matplotlib releases;
    # it is kept here because the dtype of train['dates'] is not visible.
    plt.plot_date(train['dates'], train[column] / train[column].mean(),
                  label=column)

# Without a legend the overlaid series cannot be told apart.
plt.legend()
plt.show()



In [83]:

    
# Input dependencies: scatter of humidity (y) against temp (x).
train.plot(kind='scatter', x='temp', y='humidity')
plt.show()

Analyze temperature dependence: see how count varies with temperature while holding all other variables constant.



In [46]:

    
#train[['temp', 'count']]



In [103]:

    
# Choose a dependency to analyze from the following subset of columns
# in the dataset: ('temp', 'humidity', 'weather').
dependency = 'humidity'
# The scatter plot is drawn only when the selection holds MORE than this
# many rows.
min_stats = 3

# Limit the dependency/correlation analysis to different
# slices of the function space by adjusting the following ranges.
# Any *_min / *_max bound may be set to None to leave that side open.
weather_min = 1
weather_max = 1
humidity_min = 10.0
humidity_max = 100.0
temp_min = 25.0
temp_max = 35.0
hour_min = 10
hour_max = 15
workingday = 0
holiday = 0


def _bounded(frame, column, lower, upper):
    """Return the rows of `frame` with lower <= frame[column] <= upper.

    A bound that is None is not applied.
    """
    if lower is not None:
        frame = frame[frame[column] >= lower]
    if upper is not None:
        frame = frame[frame[column] <= upper]
    return frame


def _range_label(prefix, number_format, lower, upper):
    """Build a title fragment 'prefix: lower-upper'.

    Each bound is rendered with `number_format`; a None bound is written
    as 'any' so that an open range no longer breaks the format string.
    """
    ends = ['any' if bound is None else number_format % bound
            for bound in (lower, upper)]
    return '%s: %s-%s' % (prefix, ends[0], ends[1])


# Select the day type first; `train` itself is never modified because every
# step below rebinds `data` to a new filtered frame.
data = train
data = data[data['workingday'] == workingday]
data = data[data['holiday'] == holiday]

if holiday == 1:
    title = ['Holidays']
elif workingday == 1:
    title = ['Workdays']
else:
    title = ['Weekends']

data = _bounded(data, 'hour', hour_min, hour_max)
title.append(_range_label('Hours', '%d', hour_min, hour_max))

# The variable under study is left unrestricted; the other two are held
# inside their configured ranges.
if dependency != 'weather':
    title.append(_range_label('Weather range', '%d', weather_min, weather_max))
    data = _bounded(data, 'weather', weather_min, weather_max)

if dependency != 'temp':
    title.append(_range_label('Temp range (C)', '%.1f', temp_min, temp_max))
    data = _bounded(data, 'temp', temp_min, temp_max)

if dependency != 'humidity':
    title.append(_range_label('Humidity range (%)', '%.1f',
                              humidity_min, humidity_max))
    data = _bounded(data, 'humidity', humidity_min, humidity_max)

# Single-argument print(...) gives identical output on Python 2 and Python 3.
print('Data points: %d' % len(data))
if len(data) > min_stats:
    data.plot(x=dependency, y='count', kind='scatter',
              title=', '.join(title), ax=plt.gca())
else:
    print('Not enough statistics!')

plt.show()









    



Data points: 221



In [99]:

    
# Display the column labels available on the training frame.
train.columns









    Out[99]:





Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp', u'atemp', u'humidity', u'windspeed', u'casual', u'registered', u'count', u'dates', u'hour', u'day', u'month', u'monthday', u'dayage', u'hourage', u'weekage', u'age_seconds'], dtype='object')



In [12]:

    
# Pairwise correlation matrix for a hand-picked subset of columns.
train[
    ['temp', 'count', 'weather', 'windspeed', 'dayage']
].corr()



In [ ]:

	temp	count	weather	windspeed	dayage
temp	1.000000	0.394454	-0.055035	-0.017852	0.180785
count	0.394454	1.000000	-0.128655	0.101369	0.309636
weather	-0.055035	-0.128655	1.000000	0.007261	-0.005017
windspeed	-0.017852	0.101369	0.007261	1.000000	-0.087088
dayage	0.180785	0.309636	-0.005017	-0.087088	1.000000