http://scikit-learn.org/stable/modules/preprocessing.html http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
In [57]:
%matplotlib inline
import scipy
import numpy as np
import matplotlib.pyplot as plt
import datetime
The data file looks like this:
instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
So, define a converter function to handle the date column, and import the data with genfromtxt. The 'target' values (i.e., the number of rides) are imported separately.
In [58]:
def convertfunc(x):
    """Parse a ``YYYY-MM-DD`` field into a ``datetime.datetime``.

    ``np.genfromtxt`` may hand a converter either ``bytes`` or ``str``
    (this depends on the NumPy version and its ``encoding`` setting), while
    ``strptime`` needs text, so bytes are decoded before parsing.
    """
    if isinstance(x, bytes):
        x = x.decode("ascii")
    return datetime.datetime.strptime(x, "%Y-%m-%d")


# Load the first 13 CSV columns (everything except the ride counts).
# ``skip_header`` is the supported way to skip the header row; the older
# ``skiprows`` keyword is no longer accepted by genfromtxt.
data = np.genfromtxt("Bike-Sharing-Dataset/day.csv", skip_header=1,
                     delimiter=",", usecols=tuple(range(13)),
                     converters={1: convertfunc})
# Rename the fields of the resulting structured array to the CSV header names.
data.dtype.names = ('instant', 'dteday', 'season', 'yr', 'mnth', 'holiday',
                    'weekday', 'workingday', 'weathersit',
                    'temp', 'atemp', 'hum', 'windspeed')
In [59]:
# Ride counts live in the last three CSV columns (13-15); load them as a
# structured array with one named field per count.
# ``skip_header`` replaces the ``skiprows`` keyword, which genfromtxt no
# longer accepts.
target = np.genfromtxt("Bike-Sharing-Dataset/day.csv", skip_header=1,
                       delimiter=",", usecols=(13, 14, 15),
                       names="casual,registered,cnt")
Let's spend a bit of time just exploring the data, to see what we're dealing with - so no machine learning yet, just some regular old plots
In [60]:
# Total rides per day against the running day index, drawn as unconnected markers.
fig, ax = plt.subplots()
ax.plot(data['instant'], target['cnt'], 'o')
# Label the axes so the figure can be read on its own; the trailing ';'
# keeps the return value out of the cell output.
ax.set(xlabel='instant (day index)', ylabel='cnt (total rides)',
       title='Total rides per day');
Out[60]:
Unsurprisingly, there is an obvious seasonal signal - it also appears that the bike share program has become more popular with time (although there could be other explanations for the generally upward trend of the data).
Now, let's see if there is a difference during weekends/holidays. Naively, I'd expect that there would be more rides on the weekends.
In [61]:
# Total rides per day, with marker shade taken from the workingday flag.
fig, ax = plt.subplots()
# Outline the markers so points mapped to white by the 'gray' colormap stay
# visible against a light background.
points = ax.scatter(data['instant'], target['cnt'], s=45,
                    c=data['workingday'], cmap='gray', edgecolors='black')
ax.set(xlabel='instant (day index)', ylabel='cnt (total rides)',
       title='Total rides per day, shaded by workingday')
# The colorbar is the key for the marker shade.
fig.colorbar(points, ax=ax, label='workingday');
Out[61]:
By eye, there isn't a strong difference. If you look closely, though, it seems the black dots don't track the white ones completely - let's look deeper.
In [62]:
# Registered-user rides per day, with marker shade taken from the workingday flag.
fig, ax = plt.subplots()
# Outline the markers so points mapped to white by the 'gray' colormap stay
# visible against a light background.
points = ax.scatter(data['instant'], target['registered'], s=45,
                    c=data['workingday'], cmap='gray', edgecolors='black')
ax.set(xlabel='instant (day index)', ylabel='registered rides',
       title='Registered rides per day, shaded by workingday')
# The colorbar is the key for the marker shade.
fig.colorbar(points, ax=ax, label='workingday');
Out[62]:
In [63]:
# Casual-user rides per day, with marker shade taken from the workingday flag.
fig, ax = plt.subplots()
# Outline the markers so points mapped to white by the 'gray' colormap stay
# visible against a light background.
points = ax.scatter(data['instant'], target['casual'], s=45,
                    c=data['workingday'], cmap='gray', edgecolors='black')
ax.set(xlabel='instant (day index)', ylabel='casual rides',
       title='Casual rides per day, shaded by workingday')
# The colorbar is the key for the marker shade.
fig.colorbar(points, ax=ax, label='workingday');
Out[63]:
Even by eye there is clearly a difference between casual and registered users - a sensible hypothesis is that registered users are daily commuters, whereas casual users are people who decide to go for a ride on a nice day.
In [64]:
# Total rides per day, coloured by the weathersit column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['cnt'], s=45,
                    c=data['weathersit'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='cnt (total rides)',
       title='Total rides per day, coloured by weathersit')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='weathersit');
Out[64]:
In [65]:
# Registered-user rides per day, coloured by the weathersit column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['registered'], s=45,
                    c=data['weathersit'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='registered rides',
       title='Registered rides per day, coloured by weathersit')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='weathersit');
Out[65]:
In [66]:
# Casual-user rides per day, coloured by the weathersit column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['casual'], s=45,
                    c=data['weathersit'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='casual rides',
       title='Casual rides per day, coloured by weathersit')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='weathersit');
Out[66]:
In [67]:
# Total rides per day, coloured by the atemp column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['cnt'], s=45,
                    c=data['atemp'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='cnt (total rides)',
       title='Total rides per day, coloured by atemp')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='atemp');
Out[67]:
In [68]:
# Registered-user rides per day, coloured by the atemp column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['registered'], s=45,
                    c=data['atemp'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='registered rides',
       title='Registered rides per day, coloured by atemp')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='atemp');
Out[68]:
In [69]:
# Casual-user rides per day, coloured by the atemp column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['casual'], s=45,
                    c=data['atemp'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='casual rides',
       title='Casual rides per day, coloured by atemp')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='atemp');
Out[69]:
In [70]:
# Total rides per day, coloured by the hum column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['cnt'], s=45,
                    c=data['hum'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='cnt (total rides)',
       title='Total rides per day, coloured by hum')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='hum');
Out[70]:
In [71]:
# Registered-user rides per day, coloured by the hum column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['registered'], s=45,
                    c=data['hum'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='registered rides',
       title='Registered rides per day, coloured by hum')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='hum');
Out[71]:
In [72]:
# Casual-user rides per day, coloured by the hum column.
fig, ax = plt.subplots()
points = ax.scatter(data['instant'], target['casual'], s=45,
                    c=data['hum'], cmap='jet')
ax.set(xlabel='instant (day index)', ylabel='casual rides',
       title='Casual rides per day, coloured by hum')
# The colorbar is the key for the marker colour.
fig.colorbar(points, ax=ax, label='hum');
Out[72]:
In [73]:
# Feature matrix: one row per record in `data`, one column per predictor,
# in exactly the order listed here.
feature_names = ['workingday', 'weathersit', 'atemp', 'hum', 'windspeed']
# Stack whole columns at once instead of filling the matrix row by row.
X = np.column_stack([data[name] for name in feature_names]).astype(float)
In [74]:
# Split by day of month: records dated the 1st-19th form the training set,
# records dated the 20th onwards form the test set.
ind = np.array([stamp.day for stamp in data['dteday']], dtype=float)
a = np.where(ind <= 19)[0]   # row indices of the training records
b = np.where(ind > 19)[0]    # row indices of the test records
# Every record must land in exactly one of the two sets.
assert a.size + b.size == data.size

Xtrain = X[a, :]
Xtest = X[b, :]
# The regression target is the casual-rider count.
ytrain = target['casual'][a]
ytest = target['casual'][b]
In [75]:
from sklearn import linear_model

# Lasso regression with alpha=0.1, fitted on the training rows; the value of
# the cell is the estimator's score on the held-out rows.
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(Xtrain, ytrain)
lasso.score(Xtest, ytest)
Out[75]:
In [76]:
from sklearn import svm

# Support-vector regression with a linear kernel, fitted on the training
# rows; the value of the cell is the estimator's score on the held-out rows.
svr = svm.SVR(kernel='linear')
svr.fit(Xtrain, ytrain)
svr.score(Xtest, ytest)
Out[76]:
In [77]:
from sklearn.linear_model import ElasticNet

# Elastic net with alpha=0.1 and l1_ratio=0.7, fitted on the training rows;
# the value of the cell is the estimator's score on the held-out rows.
enet = ElasticNet(alpha=0.1, l1_ratio=0.7)
enet.fit(Xtrain, ytrain)
enet.score(Xtest, ytest)
Out[77]:
In [78]:
# Compare observed casual-rider counts on the held-out rows (indices `b`)
# with the fitted lasso's predictions for those same rows.
fig, ax = plt.subplots()
ax.scatter(data['instant'][b], target['casual'][b], c='red', label='observed')
ax.scatter(data['instant'][b], lasso.predict(Xtest), label='lasso prediction')
ax.set(xlabel='instant (day index)', ylabel='casual rides',
       title='Held-out days: observed vs. lasso-predicted casual rides')
# The legend tells the two point sets apart; the trailing ';' keeps the
# return value out of the cell output.
ax.legend();
Out[78]:
In [79]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Degree-2 polynomial feature expansion followed by ordinary least squares.
# NOTE(review): fit_intercept=False presumably relies on PolynomialFeatures
# supplying a constant column by default - confirm.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression(fit_intercept=False)),
])
model.fit(Xtrain, ytrain)
# The value of the cell is the pipeline's score on the held-out rows.
model.score(Xtest, ytest)
Out[79]:
In [80]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial feature expansion followed by a lasso with alpha=0.01.
# The global `lasso` is deliberately not rebound here: the pipeline never
# used the stand-alone estimator, and rebinding it to a fresh, unfitted Lasso
# would break a re-run of the earlier cell that calls `lasso.predict(Xtest)`.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('lasso', linear_model.Lasso(alpha=0.01)),
])
model.fit(Xtrain, ytrain)
# The value of the cell is the pipeline's score on the held-out rows.
model.score(Xtest, ytest)
Out[80]:
In [ ]:
In [81]:
# Wider feature matrix: the earlier five predictors plus 'yr' and 'weekday',
# one row per record in `data`, columns in exactly the order listed here.
feature_names = ['workingday', 'yr', 'weekday', 'weathersit',
                 'atemp', 'hum', 'windspeed']
# Stack whole columns at once instead of filling the matrix row by row.
X = np.column_stack([data[name] for name in feature_names]).astype(float)
In [82]:
# Split by day of month: records dated the 1st-19th form the training set,
# records dated the 20th onwards form the test set. The split is redone here
# so that Xtrain/Xtest are sliced from the current X.
ind = np.array([stamp.day for stamp in data['dteday']], dtype=float)
a = np.where(ind <= 19)[0]   # row indices of the training records
b = np.where(ind > 19)[0]    # row indices of the test records
# Every record must land in exactly one of the two sets.
assert a.size + b.size == data.size

Xtrain = X[a, :]
Xtest = X[b, :]
# The regression target is the casual-rider count.
ytrain = target['casual'][a]
ytest = target['casual'][b]
In [83]:
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial feature expansion followed by a lasso with alpha=0.1.
# The global `lasso` is deliberately not rebound here: the pipeline never
# used the stand-alone estimator, and rebinding it to a fresh, unfitted Lasso
# would break a re-run of the earlier cell that calls `lasso.predict(Xtest)`.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('lasso', linear_model.Lasso(alpha=0.1)),
])
model.fit(Xtrain, ytrain)
# The value of the cell is the pipeline's score on the held-out rows.
model.score(Xtest, ytest)
Out[83]:
In [ ]: