In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

from sklearn.ensemble.forest import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

import math
from dateutil.relativedelta import relativedelta
from datetime import datetime, date
%matplotlib inline
plt.rcParams['figure.figsize']=(20,10)
plt.style.use('ggplot')

In [ ]:


In [12]:
def prepareDataForClassificationRF(dataset, split_index=400):
    """
    Split the housing data into train and test sets.

    The feature is the ``lotsize`` column and the target is the ``price``
    column. Rows whose index label is below ``split_index`` form the training
    set; rows whose index label is at or above it form the test set. The
    split is ordered by index label, not random.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least the ``lotsize`` and ``price`` columns.
    split_index : int, optional
        First index label that belongs to the test set (default 400, the
        value previously hard-coded in this function).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, dataset)`` where the first four
        items are pandas Series and ``dataset`` is the unmodified input frame.

    Raises
    ------
    KeyError
        If ``dataset`` has no ``lotsize`` or ``price`` column.
    """
    X = dataset['lotsize']
    y = dataset['price']

    # Both Series share the frame's index, so one pair of masks serves both.
    is_train = dataset.index < split_index
    is_test = dataset.index >= split_index

    X_train, y_train = X[is_train], y[is_train]
    X_test, y_test = X[is_test], y[is_test]

    return X_train, y_train, X_test, y_test, dataset

In [7]:
# Read the housing CSV and keep only the two columns used in this notebook.
df = pd.read_csv('../examples/Housing.csv')[['price', 'lotsize']]

In [8]:
# Show the last five rows of the loaded frame.
df.tail(n=5)


Out[8]:
price lotsize
541 91500.0 4800
542 94000.0 6000
543 103000.0 6000
544 105000.0 6000
545 105000.0 6000

In [10]:
# One line plot per column, each on its own axes.
df.plot(kind='line', subplots=True)


Out[10]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fe718e329d0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7fe7191be550>], dtype=object)

In [13]:
# Unpack the train/test split; the last item is the input frame itself.
(X_train, y_train,
 X_test, y_test,
 dataset) = prepareDataForClassificationRF(dataset=df)

In [28]:
# Fixed seed so the fitted forest -- and every table, plot and metric derived
# from it in later cells -- is the same after Restart Kernel -> Run All.
RF_Model = RandomForestRegressor(n_estimators=100,
                                 max_features=1, oob_score=True,
                                 random_state=0)
labels = y_train
# scikit-learn expects a 2-D (n_samples, n_features) array. Reshape the
# underlying NumPy values instead of indexing the Series with `[:, None]`,
# which pandas deprecated in 1.0 and rejects in 2.0.
features = X_train.values.reshape(-1, 1)
rgr = RF_Model.fit(features, labels)


def _predicted_price_frame(model, lot_sizes):
    """Predict prices for a Series of lot sizes.

    Returns a column-less DataFrame whose index, named 'predicted_price',
    holds the predictions in the order of ``lot_sizes``. Later cells call
    ``reset_index()`` on it to turn the predictions back into a column.
    """
    predictions = model.predict(lot_sizes.values.reshape(-1, 1))
    return pd.DataFrame(predictions).rename(
        columns={0: 'predicted_price'}).set_index('predicted_price')


X_test_predict = _predicted_price_frame(rgr, X_test)
X_train_predict = _predicted_price_frame(rgr, X_train)
# Training predictions first, then test predictions.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
RF_predict = pd.concat([X_train_predict, X_test_predict])

In [29]:
# `assign` aligns the predictions on the row index just like the previous
# left `join`, but overwrites an existing `predicted_price` column instead of
# raising a "columns overlap" error, so this cell can safely be re-run.
df = df.assign(predicted_price=RF_predict.reset_index()['predicted_price'])

In [30]:
# Show the first five rows, now including the predictions.
df.head(n=5)


Out[30]:
price lotsize RFpredict predicted_price
0 42000.0 5850 53971.142857 51526.00000
1 38500.0 4000 58810.523006 58179.58937
2 49500.0 3060 47540.000000 47570.00000
3 60500.0 6650 67013.626984 65406.50000
4 61000.0 6360 74531.902525 72239.44816

In [31]:
# Overlay actual and predicted prices as line plots on shared axes.
df.loc[:, ['price', 'predicted_price']].plot(kind='line')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe74c400350>

In [32]:
# Signed prediction error per row: predicted price minus actual price.
df['diff'] = df['predicted_price'].sub(df['price'])

In [33]:
# Show the last five rows, now including the prediction error.
df.tail(n=5)


Out[33]:
price lotsize RFpredict predicted_price diff
541 91500.0 4800 77032.147619 76441.033333 -15058.966667
542 94000.0 6000 94827.960101 88660.732215 -5339.267785
543 103000.0 6000 94827.960101 88660.732215 -14339.267785
544 105000.0 6000 94827.960101 88660.732215 -16339.267785
545 105000.0 6000 94827.960101 88660.732215 -16339.267785

In [34]:
# Bar chart of the prediction error, one bar per row.
df['diff'].plot.bar()


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe71501be90>

In [35]:
# Check R^2 on the training rows. This is an in-sample score (the model is
# evaluated on the same rows it was fitted on), not an estimate of
# out-of-sample accuracy.
# `.values` yields plain NumPy arrays; indexing a Series with `[:, None]` is
# deprecated since pandas 1.0 and raises in pandas 2.0.
r2 = r2_score(y_train.values, X_train_predict.reset_index().values.ravel())

In [39]:
# Display the R^2 score computed in the previous cell.
r2


Out[39]:
0.6976325043846785

In [40]:



Out[40]:
226585971.8148737

In [ ]: