In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
%pylab inline
In [2]:
# Notebook-wide Matplotlib styling: colour palette, fonts, line/patch defaults,
# axes, ticks, grid, legend and figure/savefig settings.
# NOTE(review): `mpl` is not imported explicitly in this notebook; presumably the
# `%pylab inline` magic injects it -- confirm before dropping that magic.

# Hex colours used as the default colour cycle for plotted series.
c_cycle=("#3498db","#e74c3c","#1abc9c","#9b59b6","#f1c40f","#ecf0f1","#34495e",
"#446cb3","#d24d57","#27ae60","#663399", "#f7ca18","#bdc3c7","#2c3e50")
mpl.rc('font', family='Bitstream Vera Sans', size=20)
mpl.rc('lines', linewidth=2,color="#2c3e50")
mpl.rc('patch', linewidth=0,facecolor="none",edgecolor="none")
mpl.rc('text', color='#2c3e50')

# `axes.color_cycle` was deprecated in Matplotlib 1.5 in favour of
# `axes.prop_cycle` and removed in later releases, where passing it to
# `mpl.rc` raises KeyError. Use `prop_cycle` whenever `mpl.cycler` is
# available and keep the legacy key only for installations that predate it.
axes_style = dict(facecolor='none',edgecolor="none",titlesize=25,labelsize=15,grid=False)
if hasattr(mpl, 'cycler'):
    axes_style['prop_cycle'] = mpl.cycler('color', c_cycle)
else:
    axes_style['color_cycle'] = c_cycle
mpl.rc('axes', **axes_style)

# Zero-width ticks: tick labels keep their spacing but no tick marks are drawn.
mpl.rc('xtick.major',size=10,width=0)
mpl.rc('ytick.major',size=10,width=0)
mpl.rc('xtick.minor',size=10,width=0)
mpl.rc('ytick.minor',size=10,width=0)
mpl.rc('ytick',direction="out")
mpl.rc('grid',color='#c0392b',alpha=0.3,linewidth=1)
mpl.rc('legend',numpoints=3,fontsize=15,borderpad=0,markerscale=3,labelspacing=0.2,frameon=False,framealpha=0.6,handlelength=1,handleheight=0.5)
mpl.rc('figure',figsize=(10,6),dpi=80,facecolor="none",edgecolor="none")
mpl.rc('savefig',dpi=100,facecolor="none",edgecolor="none")
In [3]:
# Read the three tab-separated input files (read_table's default delimiter is
# a tab) into DataFrames, in the order the names are unpacked.
weather, usage, stations = (
    pd.read_table(tsv_path)
    for tsv_path in ("daily_weather.tsv", "usage_2012.tsv", "stations.tsv")
)
In [4]:
# Set `season_desc` from the numeric `season_code`. Only rows whose code is
# listed here are written; rows with any other code are left as they are.
season_labels = ((1, 'winter'), (2, 'spring'), (3, 'summer'), (4, 'fall'))
for season_code, season_label in season_labels:
    weather.loc[weather['season_code'] == season_code, 'season_desc'] = season_label
In [5]:
# Convert the `date` column to datetimes so the `.dt` accessor can be used on it.
parsed_dates = pd.to_datetime(weather['date'])
weather['date'] = parsed_dates
In [6]:
# Sum of `total_riders` for each calendar month number found in `date`.
rental_month = weather['date'].dt.month
month_rental = weather.groupby(rental_month)['total_riders'].sum()
In [7]:
# Average of the `temp` column within each `season_desc` group.
# NOTE(review): the name `mean` presumably shadows the NumPy `mean` that
# `%pylab inline` star-imports -- consider a more specific name.
by_season = weather.groupby('season_desc')
mean = by_season['temp'].mean()
In [8]:
# Number of rows in `usage` for each distinct `station_start` value.
start_stations = usage['station_start']
count = start_stations.value_counts()
In [9]:
# Average number of rentals per day for each start station.
# The counts come from usage_2012.tsv, which presumably covers the whole of
# calendar year 2012 -- a leap year -- so the divisor is 366 days, not 365.
# TODO confirm the file spans the full year.
DAYS_IN_2012 = 366
average_rental_df = DataFrame({'average_rental': count / DAYS_IN_2012})
In [10]:
# Preview only the first rows instead of dumping the whole frame into the output.
average_rental_df.head(10)
Out[10]:
In [11]:
from sklearn import linear_model
In [12]:
# Start a new frame whose single `station` column holds the index labels of
# `average_rental_df`; the new frame gets a default integer index.
indexed_avg_df = DataFrame({'station': average_rental_df.index})
In [13]:
# Copy the per-station averages over positionally (raw values, no index
# alignment); `average_rental_df` has exactly one column, taken here as 1-D.
daily_averages = average_rental_df.iloc[:, 0].values
indexed_avg_df['avg_rentals'] = daily_averages
In [14]:
# Write the station labels from the index of `average_rental_df` into the
# `station` column (the frame was already constructed from this same index).
station_labels = average_rental_df.index
indexed_avg_df['station'] = station_labels
In [15]:
# Join the per-station averages with the station attributes on `station`
# (pd.merge defaults to an inner join, so unmatched stations are dropped).
avgerage_stations_df = pd.merge(indexed_avg_df, stations, on='station')
In [16]:
# Features: every column from position 8 onward of the merged frame.
# Target: the single column at position 1, kept as a one-column DataFrame
# (presumably `avg_rentals`, given the left frame's column order -- verify).
feature_columns = list(avgerage_stations_df.columns[8:])
target_columns = list(avgerage_stations_df.columns[1:2])
x = avgerage_stations_df[feature_columns]
y = avgerage_stations_df[target_columns]
In [17]:
# Fit a linear regression on all rows of x / y (no hold-out set here).
# The fit call stays last so the cell still displays the fitted estimator.
linear_reg = linear_model.LinearRegression()
linear_reg.fit(X=x, y=y)
Out[17]:
In [18]:
# Predicted vs. actual values for the model fitted on the full data set.
# Axes are labelled and oriented (predicted on x, actual on y) like the other
# predicted-vs-actual plots in this notebook so the figures can be compared.
plt.scatter(linear_reg.predict(x), y, s=50)
plt.xlabel('predicted value')
plt.ylabel('actual value')
plt.show()
In [19]:
# Display the coefficients of the linear model fitted on all rows of x / y.
linear_reg.coef_
Out[19]:
In [20]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [21]:
# Split x / y into halves for training and testing; the fixed random_state
# makes the split repeatable across runs.
split_options = dict(test_size=0.5, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)
In [22]:
# Fit a second linear regression, this time on the training half only.
# The fit call stays last so the cell still displays the fitted estimator.
lin_reg = linear_model.LinearRegression()
lin_reg.fit(X=x_train, y=y_train)
Out[22]:
In [23]:
# Predicted vs. actual values on the held-out test half for the linear model.
fig, ax = plt.subplots()
ax.scatter(lin_reg.predict(x_test), y_test, s=50)
ax.set_xlabel('predicted value')
ax.set_ylabel('actual value')
plt.show()
The points are widely scattered, so the predictions do not look accurate.
In [24]:
from sklearn.linear_model import Lasso
In [25]:
# First Lasso attempt: regularisation strength alpha=0.1.
lasso_model = Lasso(alpha=0.1)
In [26]:
# Fit the current Lasso model on the training half.
lasso_model.fit(X=x_train, y=y_train)
Out[26]:
In [27]:
# Display the coefficients of the Lasso model fitted with alpha=0.1.
lasso_model.coef_
Out[27]:
In [28]:
# Second Lasso attempt: replaces the previous model with alpha=0.5.
lasso_model = Lasso(alpha=0.5)
In [29]:
# Fit the current Lasso model on the training half.
lasso_model.fit(X=x_train, y=y_train)
Out[29]:
In [30]:
# Display the coefficients of the Lasso model fitted with alpha=0.5.
lasso_model.coef_
Out[30]:
In [31]:
# Third Lasso attempt: replaces the previous model with alpha=1. This is the
# model used by the scatter plot further down.
lasso_model = Lasso(alpha=1)
In [32]:
# Fit the current Lasso model on the training half.
lasso_model.fit(X=x_train, y=y_train)
Out[32]:
In [33]:
# Display the coefficients of the Lasso model fitted with alpha=1.
lasso_model.coef_
Out[33]:
In [34]:
# Predicted vs. actual values on the held-out test half for the Lasso model
# most recently assigned to `lasso_model`.
fig, ax = plt.subplots()
ax.scatter(lasso_model.predict(x_test), y_test, s=50)
ax.set_xlabel('predicted value')
ax.set_ylabel('actual value')
plt.show()
I can see some correlation between the predicted and actual values in this plot.
In [35]:
# Refit with a single predictor: the column at position 111 of the merged
# frame; the target is again the single column at position 1.
# NOTE(review): this rebinds the notebook-level `x` and `y`, so earlier cells
# re-run after this one will see the single-feature data.
single_feature = list(avgerage_stations_df.columns[111:112])
target_columns = list(avgerage_stations_df.columns[1:2])
x = avgerage_stations_df[single_feature]
y = avgerage_stations_df[target_columns]
lin_regr = linear_model.LinearRegression()
# The fit call stays last so the cell still displays the fitted estimator.
lin_regr.fit(X=x, y=y)
Out[35]:
In [36]:
# Predicted vs. actual values for the single-feature model.
# NOTE(review): the model is evaluated on the same rows it was fitted on, not
# on a held-out set, so this plot is not comparable to the test-set plots.
fig, ax = plt.subplots()
ax.scatter(lin_regr.predict(x), y, s=50)
ax.set_xlabel('predicted value')
ax.set_ylabel('actual value')
plt.show()
The predicted and actual values look reasonably correlated.
In [ ]: