In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('bmh')
%matplotlib inline

In [2]:
import random
random.seed(42)
np.random.seed(42)

In [3]:
data = pd.read_csv('/Users/tatanakuzenko/cian_data_clear_for_modeling.csv')
data.head()


Out[3]:
Distance Floor Kitsp Livsp New Nfloors Price Rooms Totsp Metrokm ... Brick_na Distr_C Distr_E Distr_N Distr_NE Distr_NW Distr_S Distr_SE Distr_SW Distr_W
0 23.839112 2.0 8.917164 27.971880 1.0 15.0 3964118.0 2.0 47.0 1.333333 ... 0 0 0 0 0 1 0 0 0 0
1 23.839112 2.0 7.242856 16.508310 1.0 15.0 2801360.0 1.0 30.0 1.333333 ... 0 0 0 0 0 1 0 0 0 0
2 21.923458 5.0 12.000000 47.000000 0.0 8.0 13800000.0 3.0 82.0 12.500000 ... 0 0 0 0 0 1 0 0 0 0
3 23.861184 6.0 7.735300 19.879948 1.0 12.0 3302559.0 1.0 35.0 1.333333 ... 0 0 0 0 0 1 0 0 0 0
4 23.839112 10.0 7.636811 19.205621 1.0 15.0 3310465.0 1.0 34.0 1.333333 ... 0 0 0 0 0 1 0 0 0 0

5 rows × 28 columns

Готовим данные к линейной модели. Уберем n-ные столбцы после one-hot encoding, чтоб не образовывалась линейная зависимость


In [4]:
data.drop(['Bal_na', 'Distr_N', 'Brick_na'], axis = 1, inplace = True)

Вариант с квадратами: зависимость цены от каких-то признаков может быть квадратичной. Не будем исследовать очень подробно, от каких именно, просто по всем не категориальным добавим колонку "признак в квадрате". По идее тем, которые зависят скорее линейно, модель сама поставит в коэфициент при квадрате что-то маленькое.


In [5]:
data_sq = data.copy()

squared_columns = ['Distance', 'Kitsp', 'Livsp', 'Totsp', 'Metrokm']
squared_columns_new = ['Distance_sq', 'Kitsp_sq', 'Livsp_sq', 'Totsp_sq', 'Metrokm_sq']

for i in range(len(squared_columns)):
    data_sq[squared_columns_new[i]] = [x**2 for x in data_sq[squared_columns[i]]]

In [6]:
data_sq.head(3)


Out[6]:
Distance Floor Kitsp Livsp New Nfloors Price Rooms Totsp Metrokm ... Distr_NW Distr_S Distr_SE Distr_SW Distr_W Distance_sq Kitsp_sq Livsp_sq Totsp_sq Metrokm_sq
0 23.839112 2.0 8.917164 27.97188 1.0 15.0 3964118.0 2.0 47.0 1.333333 ... 1 0 0 0 0 568.303268 79.515818 782.426047 2209.0 1.777778
1 23.839112 2.0 7.242856 16.50831 1.0 15.0 2801360.0 1.0 30.0 1.333333 ... 1 0 0 0 0 568.303268 52.458962 272.524314 900.0 1.777778
2 21.923458 5.0 12.000000 47.00000 0.0 8.0 13800000.0 3.0 82.0 12.500000 ... 1 0 0 0 0 480.638029 144.000000 2209.000000 6724.0 156.250000

3 rows × 30 columns

Вариант поскейленных данных


In [7]:
from sklearn.preprocessing import scale

to_scale = ['Distance', 'Floor', 'Kitsp', 'Livsp', 'Nfloors', 'Totsp', 'Metrokm']

data_sc = data.copy()
data_sc[to_scale] = scale(data_sc[to_scale], axis = 1)

In [8]:
data_sc.head(3)


Out[8]:
Distance Floor Kitsp Livsp New Nfloors Price Rooms Totsp Metrokm ... Brick_0.0 Brick_1.0 Distr_C Distr_E Distr_NE Distr_NW Distr_S Distr_SE Distr_SW Distr_W
0 0.385663 -1.058945 -0.601390 0.659036 1.0 -0.199024 3964118.0 2.0 1.917703 -1.103043 ... 0 1 0 0 0 1 0 0 0 0
1 1.006703 -1.162404 -0.641672 0.278592 1.0 0.128784 2801360.0 1.0 1.618616 -1.228619 ... 0 1 0 0 0 1 0 0 0 0
2 -0.192105 -0.843077 -0.573817 0.772482 0.0 -0.727680 13800000.0 3.0 2.118782 -0.554585 ... 0 1 0 0 0 1 0 0 0 0

3 rows × 25 columns

В прошлом ноутбуке изучается рпспределение цен. Оно так себе - очень большая плотность в начале вариационного ряда и она быстро падает до очень маленьких значений. При таких данных имеет смысл прологарифмировать колонку и работать уже с логарифмом. Не забудем перед посчетом ошибки модели возвести экспоненту в получившуюся степень.


In [9]:
data_log = data.copy()
data_log['Price'] = np.log(data_log['Price'])

In [10]:
data_log['Price'].head()


Out[10]:
0    15.192794
1    14.845616
2    16.440179
3    15.010208
4    15.012599
Name: Price, dtype: float64

Комбинации вышеизложенных вариантов.


In [11]:
data_sc_log = data.drop('Price', axis = 1)
data_sc_log[to_scale] = scale(data_sc_log[to_scale], axis = 1)
data_sc_log['Price'] = np.log(data['Price'])

In [12]:
data_sc_log.head(3)


Out[12]:
Distance Floor Kitsp Livsp New Nfloors Rooms Totsp Metrokm Bal_0.0 ... Brick_1.0 Distr_C Distr_E Distr_NE Distr_NW Distr_S Distr_SE Distr_SW Distr_W Price
0 0.385663 -1.058945 -0.601390 0.659036 1.0 -0.199024 2.0 1.917703 -1.103043 0 ... 1 0 0 0 1 0 0 0 0 15.192794
1 1.006703 -1.162404 -0.641672 0.278592 1.0 0.128784 1.0 1.618616 -1.228619 0 ... 1 0 0 0 1 0 0 0 0 14.845616
2 -0.192105 -0.843077 -0.573817 0.772482 0.0 -0.727680 3.0 2.118782 -0.554585 0 ... 1 0 0 0 1 0 0 0 0 16.440179

3 rows × 25 columns


In [13]:
data_sq_log = data.drop('Price', axis = 1)

for i in range(len(squared_columns)):
    data_sq_log[squared_columns_new[i]] = [x**2 for x in data_sq_log[squared_columns[i]]]
    
data_sq_log['Price'] = np.log(data['Price'])

In [14]:
data_sq_log.head(3)


Out[14]:
Distance Floor Kitsp Livsp New Nfloors Rooms Totsp Metrokm Bal_0.0 ... Distr_S Distr_SE Distr_SW Distr_W Distance_sq Kitsp_sq Livsp_sq Totsp_sq Metrokm_sq Price
0 23.839112 2.0 8.917164 27.97188 1.0 15.0 2.0 47.0 1.333333 0 ... 0 0 0 0 568.303268 79.515818 782.426047 2209.0 1.777778 15.192794
1 23.839112 2.0 7.242856 16.50831 1.0 15.0 1.0 30.0 1.333333 0 ... 0 0 0 0 568.303268 52.458962 272.524314 900.0 1.777778 14.845616
2 21.923458 5.0 12.000000 47.00000 0.0 8.0 3.0 82.0 12.500000 0 ... 0 0 0 0 480.638029 144.000000 2209.000000 6724.0 156.250000 16.440179

3 rows × 30 columns


In [15]:
data_sc_sq_log = data.drop('Price', axis = 1)

data_sc_sq_log[to_scale] = scale(data_sc_sq_log[to_scale], axis = 1)

for i in range(len(squared_columns)):
    data_sc_sq_log[squared_columns_new[i]] = [x**2 for x in data_sc_sq_log[squared_columns[i]]]
    
data_sc_sq_log['Price'] = np.log(data['Price'])

In [16]:
data_sc_sq_log.head(3)


Out[16]:
Distance Floor Kitsp Livsp New Nfloors Rooms Totsp Metrokm Bal_0.0 ... Distr_S Distr_SE Distr_SW Distr_W Distance_sq Kitsp_sq Livsp_sq Totsp_sq Metrokm_sq Price
0 0.385663 -1.058945 -0.601390 0.659036 1.0 -0.199024 2.0 1.917703 -1.103043 0 ... 0 0 0 0 0.148736 0.361670 0.434329 3.677586 1.216705 15.192794
1 1.006703 -1.162404 -0.641672 0.278592 1.0 0.128784 1.0 1.618616 -1.228619 0 ... 0 0 0 0 1.013452 0.411744 0.077614 2.619918 1.509505 14.845616
2 -0.192105 -0.843077 -0.573817 0.772482 0.0 -0.727680 3.0 2.118782 -0.554585 0 ... 0 0 0 0 0.036904 0.329267 0.596729 4.489238 0.307564 16.440179

3 rows × 30 columns


In [17]:
data_sq_sc_log = data.drop('Price', axis = 1)

for i in range(len(squared_columns)):
    data_sq_sc_log[squared_columns_new[i]] = [x**2 for x in data_sq_sc_log[squared_columns[i]]]

data_sq_sc_log[to_scale] = scale(data_sq_sc_log[to_scale], axis = 1)
data_sq_sc_log['Price'] = np.log(data['Price'])

In [18]:
data.head(3)


Out[18]:
Distance Floor Kitsp Livsp New Nfloors Price Rooms Totsp Metrokm ... Brick_0.0 Brick_1.0 Distr_C Distr_E Distr_NE Distr_NW Distr_S Distr_SE Distr_SW Distr_W
0 23.839112 2.0 8.917164 27.97188 1.0 15.0 3964118.0 2.0 47.0 1.333333 ... 0 1 0 0 0 1 0 0 0 0
1 23.839112 2.0 7.242856 16.50831 1.0 15.0 2801360.0 1.0 30.0 1.333333 ... 0 1 0 0 0 1 0 0 0 0
2 21.923458 5.0 12.000000 47.00000 0.0 8.0 13800000.0 3.0 82.0 12.500000 ... 0 1 0 0 0 1 0 0 0 0

3 rows × 25 columns


In [19]:
datasets = [data, data_sq, data_sc, data_log, data_sc_log, data_sq_log, data_sc_sq_log, data_sq_sc_log]

Для каждого варианта составляем модель, обучаем ее и сопоставляем ошибки.


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

simple = LinearRegression()
squared = LinearRegression()
scaled = LinearRegression()
log = LinearRegression()
sc_log = LinearRegression()
sq_log = LinearRegression()
sc_sq_log = LinearRegression()
sq_sc_log = LinearRegression()


models = [
    
    ('simple', simple),
    ('squared', squared),
    ('scaled', scaled),
    ('log', log), 
    ('sc_log', sc_log),
    ('sq_log', sq_log),
    ('sc_sq_log', sc_sq_log),
    ('sq_sc_log', sq_sc_log)
    
]

for i in range(len(models)):
    
        tmp = datasets[i]
        train = tmp.drop('Price', axis = 1)
        target = tmp['Price']
        target = pd.DataFrame(target)
    
        Xtr, Xval, Ytr, Yval = train_test_split(train, target, test_size=0.2)
        
        modelname, model = models[i]
        model.fit(Xtr, Ytr);
        
        predictions = model.predict(Xval)
        
        
        if (modelname.find('log', 0, len(modelname)) < 0):
            rmse = sqrt(mean_squared_error(Yval, predictions))
            mae = mean_absolute_error(Yval, predictions)
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)
        else:
            rmse = sqrt(mean_squared_error(np.exp(Yval), np.exp(predictions)))
            mae = mean_absolute_error(np.exp(Yval), np.exp(predictions))
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)


Model: simple      RMSE:  12153541.876059694    MAE:  6432388.82077
Model: squared      RMSE:  10005147.516445344    MAE:  5545852.72922
Model: scaled      RMSE:  16583055.279802177    MAE:  10251017.0877
Model: log      RMSE:  23298500.59790149    MAE:  5640816.46911
Model: sc_log      RMSE:  16495286.562965117    MAE:  7098882.64004
Model: sq_log      RMSE:  9190883.424682342    MAE:  3599644.04623
Model: sc_sq_log      RMSE:  14839293.760010557    MAE:  6263896.23081
Model: sq_sc_log      RMSE:  46915869.24756888    MAE:  7341229.36555

При нескольких прогонках мною было подмечено, что ошибки моделей хоть и меняются, но относительно друг друга модели в среднем особо не передвигатся: лидируют simple, squared и sq_log. Теперь попробуем регуляризовать.


In [21]:
from sklearn.linear_model import Lasso

simple = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
squared = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
scaled = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
log = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
sc_log = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
sq_log = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
sc_sq_log = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)
sq_sc_log = Lasso(alpha = 0.01, max_iter = 100000, tol = 0.01)


models = [
    ('simple', simple),
    ('squared', squared),
    ('scaled', scaled),
    ('log', log), 
    ('sc_log', sc_log),
    ('sq_log', sq_log),
    ('sc_sq_log', sc_sq_log),
    ('sq_sc_log', sq_sc_log)
]


for i in range(len(models)):
    
        tmp = datasets[i]
        train = tmp.drop('Price', axis = 1)
        target = tmp['Price']
        target = pd.DataFrame(target)
    
        Xtr, Xval, Ytr, Yval = train_test_split(train, target, test_size=0.2)
        
        modelname, model = models[i]
        model.fit(Xtr, Ytr);
        
        predictions = model.predict(Xval)

        coef = model.coef_
        zeros = []
        for j in range(len(Xtr.columns)):
            if coef[j] == 0:
                zeros.append(Xtr.columns[j])
                
        if (modelname.find('log', 0, len(modelname)) < 0):
            rmse = sqrt(mean_squared_error(Yval, predictions))
            mae = mean_absolute_error(Yval, predictions)
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae, '   Coef: ', zeros)
        else:
            rmse = sqrt(mean_squared_error(np.exp(Yval), np.exp(predictions)))
            mae = mean_absolute_error(np.exp(Yval), np.exp(predictions))
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae, '  Coef: ', zeros)


Model: simple      RMSE:  12112815.346096275    MAE:  6576481.83687    Coef:  []
Model: squared      RMSE:  12822193.821817843    MAE:  6373621.17794    Coef:  []
/Users/tatanakuzenko/anaconda/lib/python3.4/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Model: scaled      RMSE:  16527024.07998499    MAE:  10811417.728    Coef:  []
Model: log      RMSE:  14325659.480923688    MAE:  5142047.41252   Coef:  ['Bal_0.0', 'Bal_1.0', 'Bal_2.0', 'Bal_3.0', 'Bal_4.0', 'Brick_0.0', 'Distr_E', 'Distr_NE', 'Distr_NW', 'Distr_S', 'Distr_SW']
Model: sc_log      RMSE:  15640085.517072018    MAE:  7164665.16943   Coef:  ['Floor', 'Livsp', 'Nfloors', 'Totsp', 'Metrokm', 'Bal_0.0', 'Bal_2.0', 'Bal_3.0', 'Bal_4.0', 'Distr_NW', 'Distr_S']
Model: sq_log      RMSE:  9342244.167797212    MAE:  3945027.1405   Coef:  ['Livsp', 'Bal_0.0', 'Bal_1.0', 'Bal_2.0', 'Bal_3.0', 'Bal_4.0', 'Brick_0.0', 'Brick_1.0', 'Distr_E', 'Distr_NE', 'Distr_NW', 'Distr_S', 'Distr_SW', 'Metrokm_sq']
Model: sc_sq_log      RMSE:  16676413.316755474    MAE:  7117636.66374   Coef:  ['Floor', 'Kitsp', 'Livsp', 'Nfloors', 'Totsp', 'Metrokm', 'Bal_0.0', 'Bal_2.0', 'Bal_3.0', 'Bal_4.0', 'Distr_NW', 'Distr_S', 'Livsp_sq', 'Metrokm_sq']
Model: sq_sc_log      RMSE:  18679842.94271672    MAE:  6385599.39503   Coef:  ['Floor', 'Kitsp', 'Livsp', 'Totsp', 'Metrokm', 'Bal_0.0', 'Bal_2.0', 'Bal_3.0', 'Bal_4.0', 'Distr_NE', 'Distr_NW', 'Distr_S']

В среднем RMSE была чуть меньше. Лучше остальных те же модели (вот это уже стабильно просматривалось): sq_log, simple, squared.


In [23]:
from sklearn.linear_model import Ridge

simple = Ridge(alpha = .01, max_iter = 10000)
squared = Ridge(alpha = .01, max_iter = 10000)
scaled = Ridge(alpha = .01, max_iter = 10000)
log = Ridge(alpha = .01, max_iter = 10000)
sc_log = Ridge(alpha = .01, max_iter = 10000)
sq_log = Ridge(alpha = .01, max_iter = 10000)
sc_sq_log = Ridge(alpha = .01, max_iter = 10000)
sq_sc_log = Ridge(alpha = .01, max_iter = 10000)

models = [
    ('simple', simple),
    ('squared', squared),
    ('scaled', scaled),
    ('log', log), 
    ('sc_log', sc_log),
    ('sq_log', sq_log),
    ('sc_sq_log', sc_sq_log),
    ('sq_sc_log', sq_sc_log)
]


for i in range(len(models)):
    
        tmp = datasets[i]
        train = tmp.drop('Price', axis = 1)
        target = tmp['Price']
        target = pd.DataFrame(target)
    
        Xtr, Xval, Ytr, Yval = train_test_split(train, target, test_size=0.2)
        
        modelname, model = models[i]
        model.fit(Xtr, Ytr);
        
        predictions = model.predict(Xval)
       
        if (modelname.find('log', 0, len(modelname)) < 0):
            rmse = sqrt(mean_squared_error(Yval, predictions))
            mae = mean_absolute_error(Yval, predictions)
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)
        else:
            rmse = sqrt(mean_squared_error(np.exp(Yval), np.exp(predictions)))
            mae = mean_absolute_error(np.exp(Yval), np.exp(predictions))
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)


Model: simple      RMSE:  11620155.486068701    MAE:  6232417.53036
Model: squared      RMSE:  13112323.630662125    MAE:  6536731.9125
Model: scaled      RMSE:  17447091.09981659    MAE:  10668545.24
Model: log      RMSE:  11399624.010805445    MAE:  4354873.71372
Model: sc_log      RMSE:  15785314.062685568    MAE:  6931541.71058
Model: sq_log      RMSE:  9651623.515418168    MAE:  4003851.62368
Model: sc_sq_log      RMSE:  17442599.171527397    MAE:  7370020.75842
Model: sq_sc_log      RMSE:  23755273.177662212    MAE:  6596393.50778

Если усреднять запуски, то результаты лучше у модели с регуляризацией, причем у Ridge. По итогам всех эспериментов наилучшей выглядит модель для sq_log данных.

Теперь запустим эти 8 вариантов для RandomForest

Подгрузим заново датасет, чтоб можно было использовать удаленные для линейных моделей колонки. Деревьям они как раз не помешают.


In [24]:
data_rf = pd.read_csv('/Users/tatanakuzenko/cian_data_clear_for_modeling.csv')

In [25]:
data_sq_rf = data_rf

for i in range(len(squared_columns_new)):
        data_sq_rf[squared_columns_new[i]] = 0

for i in range(len(squared_columns)):
    data_sq_rf[squared_columns_new[i]] = [x**2 for x in data_sq_rf[squared_columns[i]]]

In [26]:
data_sc_rf = data.copy()
data_sc_rf[to_scale] = scale(data_sc_rf[to_scale], axis = 1)

In [27]:
data_log_rf = data.copy()
data_log_rf['Price'] = np.log(data_log_rf['Price'])

In [28]:
data_sq_log_rf = data.drop('Price', axis = 1)

for i in range(len(squared_columns)):
    data_sq_log_rf[squared_columns_new[i]] = [x**2 for x in data_sq_log_rf[squared_columns[i]]]
    
data_sq_log_rf['Price'] = np.log(data['Price'])

In [29]:
data_sc_log_rf = data.drop('Price', axis = 1)
data_sc_log_rf[to_scale] = scale(data_sc_log_rf[to_scale], axis = 1)
data_sc_log_rf['Price'] = np.log(data['Price'])

In [30]:
data_sc_sq_log_rf = data.drop('Price', axis = 1)

data_sc_sq_log_rf[to_scale] = scale(data_sc_sq_log_rf[to_scale], axis = 1)

for i in range(len(squared_columns)):
    data_sc_sq_log_rf[squared_columns_new[i]] = [x**2 for x in data_sc_sq_log_rf[squared_columns[i]]]
    
data_sc_sq_log_rf['Price'] = np.log(data['Price'])

In [31]:
data_sq_sc_log_rf = data.drop('Price', axis = 1)

for i in range(len(squared_columns)):
    data_sq_sc_log_rf[squared_columns_new[i]] = [x**2 for x in data_sq_sc_log_rf[squared_columns[i]]]

data_sq_sc_log_rf[to_scale] = scale(data_sq_sc_log_rf[to_scale], axis = 1)
data_sq_sc_log_rf['Price'] = np.log(data['Price'])

In [32]:
datasets_rf = [data_rf, data_sq_rf, data_sc_rf, data_log_rf, data_sc_log_rf, data_sq_log_rf, data_sc_sq_log_rf, data_sq_sc_log_rf]

In [33]:
from sklearn.ensemble import RandomForestRegressor

simple = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
squared = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
scaled = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
log = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
sc_log = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
sq_log = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
sc_sq_log = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)
sq_sc_log = RandomForestRegressor(n_estimators=60, criterion='mse', max_depth=15)

models = [
    ('simple', simple),
    ('squared', squared),
    ('scaled', scaled),
    ('log', log), 
    ('sc_log', sc_log),
    ('sq_log', sq_log),
    ('sc_sq_log', sc_sq_log),
    ('sq_sc_log', sq_sc_log)
]


for i in range(len(models)):
    
        tmp = datasets[i]
        train = tmp.drop('Price', axis = 1)
        target = tmp['Price']
    
        Xtr, Xval, Ytr, Yval = train_test_split(train, target, test_size=0.2)
        
        modelname, model = models[i]
        model.fit(Xtr, Ytr);
        
        predictions = model.predict(Xval)
       
        if (modelname.find('log', 0, len(modelname)) < 0):
            rmse = sqrt(mean_squared_error(Yval, predictions))
            mae = mean_absolute_error(Yval, predictions)
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)
        else:
            rmse = sqrt(mean_squared_error(np.exp(Yval), np.exp(predictions)))
            mae = mean_absolute_error(np.exp(Yval), np.exp(predictions))
            print('Model:', modelname, '     RMSE: ', rmse, '   MAE: ', mae)


Model: simple      RMSE:  8809104.37649344    MAE:  3517872.66469
Model: squared      RMSE:  9268194.408569915    MAE:  3729234.14066
Model: scaled      RMSE:  13527026.442443172    MAE:  5515567.09749
Model: log      RMSE:  9334100.086945478    MAE:  3297519.69191
Model: sc_log      RMSE:  12030550.307060244    MAE:  4624094.91416
Model: sq_log      RMSE:  9724020.573222538    MAE:  3516194.14099
Model: sc_sq_log      RMSE:  13742529.60333157    MAE:  5235121.73636
Model: sq_sc_log      RMSE:  10413170.775034726    MAE:  3780036.39832

На лесах сопоставимо хорошие результаты дают все, кроме scaled, sc_log, sc_sq_log

В среднем суммарная ошибка составляет 105e05 на 1000 объектов для предсказаний линейных моделей и 97e05 для случайных лесов.