In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import math
from dateutil.relativedelta import relativedelta
from datetime import datetime, date
%matplotlib inline
plt.rcParams['figure.figsize']=(20,10)
plt.style.use('ggplot')
In [ ]:
In [12]:
def prepareDataForClassificationRF(dataset, split_index=400):
    """
    Split the data into train and test sets for the random forest.

    The single feature is the ``lotsize`` column and the target is the
    ``price`` column. Rows whose index label is below ``split_index`` form
    the training set; rows whose index label is at or above it form the
    test set. No new column is created and ``dataset`` is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing ``lotsize`` and ``price`` columns. The split
        compares index *labels* (not positions), so the index must be
        comparable with ``split_index`` (e.g. the default integer index).
    split_index : int, default 400
        First index label that belongs to the test set.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, dataset)``; the first four are
        pandas Series and the last is the input frame itself.
    """
    X = dataset['lotsize']
    y = dataset['price']

    # X and y share the frame's index, so one pair of masks serves both.
    train_mask = dataset.index < split_index
    test_mask = dataset.index >= split_index

    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    return X_train, y_train, X_test, y_test, dataset
In [7]:
# Load the housing data and keep only the target (price) and the single
# feature (lotsize) used in the rest of the notebook.
df = pd.read_csv('../examples/Housing.csv')[['price', 'lotsize']]
In [8]:
# Show the last rows of the price/lotsize frame as a quick check of the load.
df.tail()
Out[8]:
In [10]:
# subplots=True draws each column (price, lotsize) on its own axes.
df.plot(subplots=True)
Out[10]:
In [13]:
# Split lotsize (feature) and price (target) into train/test Series by index
# label; `dataset` is the same frame object that was passed in.
X_train, y_train, X_test, y_test, dataset = prepareDataForClassificationRF(df)
In [28]:
# Seed the forest so the bootstrap samples -- and therefore the predictions
# and scores derived from them -- are the same on every run.
RANDOM_STATE = 42

RF_Model = RandomForestRegressor(
    n_estimators=100,
    max_features=1,
    oob_score=True,
    random_state=RANDOM_STATE,
)

# scikit-learn expects a 2-D (n_samples, n_features) matrix. Convert through
# NumPy: indexing a Series with [:, None] is not supported by pandas >= 2.0.
labels = y_train
features = np.asarray(X_train).reshape(-1, 1)
test_features = np.asarray(X_test).reshape(-1, 1)

rgr = RF_Model.fit(features, labels)

# Predictions are kept in the *index* (named 'predicted_price') of otherwise
# column-less frames; later cells call reset_index() to get them as a column.
X_test_predict = pd.DataFrame(
    {'predicted_price': rgr.predict(test_features)}
).set_index('predicted_price')
X_train_predict = pd.DataFrame(
    {'predicted_price': rgr.predict(features)}
).set_index('predicted_price')

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# result: training predictions first, followed by test predictions.
RF_predict = pd.concat([X_train_predict, X_test_predict])
In [29]:
# Attach the predictions to df by row index. Dropping any existing
# 'predicted_price' column first keeps this cell re-runnable: without it, a
# second execution makes join() raise because the column names overlap.
df = (
    df.drop(columns='predicted_price', errors='ignore')
      .join(RF_predict.reset_index())
)
In [30]:
# Check that the join added the predicted_price column next to price/lotsize.
df.head()
Out[30]:
In [31]:
# Overlay actual and predicted prices on one set of axes.
df.loc[:, ['price', 'predicted_price']].plot()
Out[31]:
In [32]:
# Signed prediction error: positive where the model over-predicts the price.
df['diff'] = df['predicted_price'].sub(df['price'])
In [33]:
# Show the last rows, which now include the diff (predicted - actual) column.
df.tail()
Out[33]:
In [34]:
# One bar per row showing the signed prediction error.
df['diff'].plot.bar()
Out[34]:
In [35]:
# Check R2 on the training data.
# This is an in-sample score: the training targets are compared with the
# model's predictions for those same rows, so it says nothing about the
# held-out test rows. The predictions live in X_train_predict's index.
# np.asarray replaces y_train[:, None], which pandas >= 2.0 no longer supports.
r2 = r2_score(np.asarray(y_train), np.asarray(X_train_predict.index))
In [39]:
# Display the R2 score computed from y_train and the training predictions.
r2
Out[39]:
In [40]:
Out[40]:
In [ ]: