In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
%matplotlib inline
In [96]:
# Load the three tab-separated inputs: daily weather, the 2012 rental log,
# and the per-station table.
weather = pd.read_csv('daily_weather.tsv', sep='\t')
usage = pd.read_csv('usage_2012.tsv', sep='\t')
stations = pd.read_csv('stations.tsv', sep='\t')
In [97]:
# Count how many rentals started at each station.
station_counts = usage.groupby('station_start')['station_start'].count()

# Turn the totals into a per-day average. 366.0 is the number of days in a
# leap year (the usage file is named for 2012); the float keeps the division
# from truncating.
station_rentals_per_day = DataFrame(
    {
        'rentals': station_counts.values / 366.0,
        'station': station_counts.index,
    },
    columns=['rentals', 'station'],
)
In [98]:
# Preview the first rows of the per-station average daily rentals.
station_rentals_per_day.head()
Out[98]:
In [99]:
# Average rentals per day for every start station, joined onto the station
# table.
#   s             -- just the station identifiers from the station table
#   u             -- the rental log's start station, relabelled as 'station'
#   counts        -- number of rentals started at each station
#   c             -- station / average-daily-count pairs
#   m             -- stations that appear in both `s` and `c`
#   stations_data -- full station table with the 'counts' column attached
s = stations[['station']]
u = pd.concat([usage['station_start']], axis=1, keys=['station'])
counts = u['station'].value_counts()
c = DataFrame(counts.index, columns=['station'])
# Divide by a float, matching the 366.0 used for station_rentals_per_day:
# the previous per-element `x / 366` is integer division on Python 2 and
# silently truncated every average to a whole number.
c['counts'] = counts.values / 366.0
m = pd.merge(s, c, on='station')
stations_data = stations.merge(m, on='station')
In [100]:
# Two-column frame (station, avg_rentals) taken directly from `m`.
# This replaces building a placeholder frame from another frame's index and
# then overwriting its columns with one-column DataFrames, which only worked
# while the row labels of both frames happened to line up.
df = m[['station', 'counts']].rename(columns={'counts': 'avg_rentals'})

# Attach every station attribute. Column order stays station, avg_rentals,
# then the remaining station-table columns, so later positional column
# slices still select the same data.
stations_vals = pd.merge(left=df, right=stations, on='station')
In [101]:
# Predictors: every column from position 8 onward.
# Target: the single column at position 1, kept two-dimensional.
x = stations_vals.iloc[:, 8:]
y = stations_vals.iloc[:, 1:2]

# Ordinary least squares on the full data set; the fitted estimator is the
# cell's displayed value.
linear_regression = linear_model.LinearRegression()
linear_regression.fit(x, y)
Out[101]:
In [102]:
# In-sample check: predictions of the full-data model against the targets.
fig, ax = plt.subplots()
ax.scatter(linear_regression.predict(x), y)
ax.set_xlabel('predicted values')
ax.set_ylabel('actual values')
plt.show()
In [103]:
# Show the coefficients learned by the full-data linear model.
linear_regression.coef_
Out[103]:
Outlier coefficients would be inaccurate at predicting.
In [104]:
# Hold out a third of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Fit on the training portion only (fit returns the estimator itself) and
# display the fitted estimator.
lin_regr = linear_model.LinearRegression().fit(x_train, y_train)
lin_regr
Out[104]:
In [105]:
# Hold-out check: predictions on the test rows against their true values.
fig, ax = plt.subplots()
ax.scatter(lin_regr.predict(x_test), y_test)
ax.set_xlabel('predicted values')
ax.set_ylabel('actual values')
plt.show()
In [106]:
# Same hold-out comparison with the axes swapped (actual on x, predicted on
# y). The axes are labelled explicitly because the orientation is the reverse
# of the neighbouring scatter plots, and plt.show() keeps the artist repr out
# of the cell output.
plt.scatter(y_test, lin_regr.predict(x_test))
plt.xlabel('actual values')
plt.ylabel('predicted values')
plt.show()
Out[106]:
Too many outliers; the predictions would be inaccurate.
In [107]:
# Lasso fit with a light penalty (alpha = 0.1) on the training split; show the
# coefficients rounded to one decimal place.
model = Lasso(alpha=0.1)
model.fit(x_train, y_train)
model.coef_.round(1)
Out[107]:
In [108]:
# Lasso fit with a stronger penalty (alpha = 0.5) on the training split; show
# the coefficients rounded to one decimal place.
model = Lasso(alpha=0.5)
model.fit(x_train, y_train)
model.coef_.round(1)
Out[108]:
In [109]:
# Hold-out check for the Lasso model fitted in the preceding cell.
# Previously this plotted lin_regr (the unregularised model) again, which
# reproduced the earlier hold-out figure exactly and never showed the Lasso
# predictions.
plt.scatter(model.predict(x_test), y_test)
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()
Better.
In [110]:
# Refit using one predictor only: the column at position 111.
# The target is still the single column at position 1.
# Note that this rebinds x, y and lin_regr.
x = stations_vals.iloc[:, 111:112]
y = stations_vals.iloc[:, 1:2]

# Fit on all rows; the fitted estimator is the cell's displayed value.
lin_regr = linear_model.LinearRegression()
lin_regr.fit(x, y)
Out[110]:
In [115]:
# In-sample check for the single-feature model: predictions against targets.
fig, ax = plt.subplots()
ax.scatter(lin_regr.predict(x), y)
ax.set_xlabel('predicted value')
ax.set_ylabel('actual value')
plt.show()
In [ ]:
In [ ]: