In [1]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which input CSV chunks are available before loading anything.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.


['train3.csv', 'label3.csv', 'train1.csv', 'train2.csv', 'test5.csv', 'test4.csv', 'test3.csv', 'train5.csv', 'test1.csv', 'label4.csv', 'label5.csv', 'label1.csv', 'label2.csv', 'train4.csv', 'test2.csv', 'test6.csv']

In [2]:
import gc
from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd

1. Sklearn baseline (kept for reference, commented out below)


In [3]:
# from sklearn import metrics
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import StandardScaler

In [4]:
# train_data = np.array([]).reshape([0, 13])
# train_label = np.array([])
# test_data = np.array([]).reshape([0, 13])

# for i in range(1, 3):
#     train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % i, header=None).values))
#     train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)))
# for i in range(1, 7):
#     test_data = np.concatenate((test_data, pd.read_csv('../input/test%d.csv' % i, header=None).values))
    
# print(train_data.shape)
# print(test_data.shape)

In [5]:
# model = RandomForestRegressor(n_estimators=20,
#                            n_jobs=-1,
#                            max_features=6,
#                            min_samples_split=10000,
#                            min_samples_leaf=10,
#                            max_depth=10)
# model.fit(train_data, train_label)

In [6]:
# arr = model.predict(test_data)
# df = pd.DataFrame({ 'id': range(1, arr.size + 1), 'Predicted': arr })
# df.to_csv('./rfr.csv', index=False)

2. Random forest regressor implemented from scratch


In [7]:
class RandomForestRegressor(object):
    """Bagged ensemble of DecisionTreeRegressor trees, fitted in parallel.

    Each tree is trained on ``sample_size`` rows drawn with replacement
    (bootstrap).  Repeated ``fit`` calls *append* ``n_trees`` new trees to
    ``self.trees``, so the forest can be grown incrementally over data
    chunks.  ``predict`` averages the predictions of all trees.
    """
    def __init__(self, n_trees, sample_size, min_leaf_size=5, n_jobs=-1, max_depth=None):
        # n_trees: trees added per fit() call.
        # sample_size: rows drawn (with replacement) for each tree.
        # n_jobs: worker processes for fit/predict; -1 means use cpu_count().
        # max_depth: depth budget handed to each tree; None means unlimited.
        self.n_trees = n_trees
        self.sample_size = sample_size
        self.min_leaf_size = min_leaf_size
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.trees = []

    def get_sample_items(self):
        """Draw one bootstrap sample (with replacement) from the stored data."""
        idxs = np.random.choice(len(self.data), self.sample_size)
        return self.data.iloc[idxs], self.label[idxs]

    def create_trees(self):
        """Build ``n_trees`` fresh, unfitted trees."""
        return [DecisionTreeRegressor(self.min_leaf_size, self.max_depth) for i in range(self.n_trees)]

    def fit(self, x, y, n_jobs=None):
        """Fit ``n_trees`` new trees on (x, y) and append them to the forest.

        x: 2-D array or DataFrame of features; y: 1-D array of targets.
        """
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        self.data = x
        self.label = y
        if n_jobs is None:
            n_jobs = self.n_jobs
        self.fit_with_workers(self.fit_tree, n_jobs)
        # Drop references so the training chunk can be garbage-collected.
        self.data = None
        self.label = None

    def fit_with_workers(self, fit_fn, n_jobs=-1):
        """Map ``fit_fn`` over fresh trees in a process pool; append results."""
        try:
            workers = cpu_count()
        except NotImplementedError:
            workers = 1  # cpu_count() may be unavailable on some platforms
        if n_jobs != -1:
            workers = n_jobs
        print('fit with %d workers' % workers)
        # BUG FIX: the original never closed the Pool, leaking worker
        # processes on every fit() call; the context manager tears the
        # pool down once map() has returned.
        with Pool(processes=workers) as pool:
            result = pool.map(fit_fn, self.create_trees())
        self.trees += list(result)

    def fit_tree(self, tree):
        """Fit a single tree on its own bootstrap sample (runs in a worker)."""
        data, label = self.get_sample_items()
        tree = tree.fit(data, label)
        return tree

    def predict(self, x, n_jobs=None):
        """Return the per-row mean of every tree's prediction for ``x``."""
        if n_jobs is None:
            n_jobs = self.n_jobs
        self.predict_data = x
        predicted = np.stack(self.predict_with_workers(self.predict_tree, n_jobs))
        all_predicted = np.mean(predicted, axis=0)
        self.predict_data = None
        return all_predicted

    def predict_with_workers(self, predict_fn, n_jobs=-1):
        """Map ``predict_fn`` over all fitted trees in a process pool."""
        try:
            workers = cpu_count()
        except NotImplementedError:
            workers = 1
        if n_jobs != -1:
            workers = n_jobs
        print('predict with %d workers' % workers)
        # BUG FIX: tear the pool down instead of leaking it (see fit_with_workers).
        with Pool(processes=workers) as pool:
            result = pool.map(predict_fn, self.trees)
        return list(result)

    def predict_tree(self, tree):
        """Predict with one tree (runs in a worker process)."""
        predicted = tree.predict(self.predict_data)
        return predicted

In [8]:
class DecisionTreeRegressor(object):
    """CART-style regression tree grown with variance-reduction splits.

    Leaves predict the mean target of the training rows that reach them.
    ``max_depth=None`` grows until no valid split exists (constrained only
    by ``min_leaf_size``).
    """
    def __init__(self, min_leaf_size, max_depth=None):
        # min_leaf_size: minimum rows kept on each side of a split.
        # max_depth: remaining depth budget; None means unlimited.
        self.min_leaf_size = min_leaf_size
        self.max_depth = max_depth
        self.split_idx = -1      # feature column used to split (-1 while a leaf)
        self.split_value = None  # threshold: rows with x <= value go left
        self.value = None        # mean target of the rows seen by this node
        self.l_tree = None
        self.r_tree = None

    def fit(self, x, y):
        """Fit on feature DataFrame ``x`` and 1-D target array ``y``."""
        self.value = np.mean(y)
        if self.max_depth is not None and self.max_depth <= 1:
            return self
        self.split(x, y)
        return self

    def split(self, data, label):
        """Find the best split and recursively grow children (no-op if none)."""
        split_idx, split_value, score = self.find_split_point(data, label)
        if score == float('inf'):
            return
        self.split_idx = split_idx
        self.split_value = split_value

        l_idxs = np.nonzero(data.iloc[:, split_idx] <= split_value)[0]
        r_idxs = np.nonzero(data.iloc[:, split_idx] > split_value)[0]

        # BUG FIX: the original computed ``self.max_depth - 1`` unconditionally,
        # raising TypeError whenever max_depth was None (the declared default).
        child_depth = None if self.max_depth is None else self.max_depth - 1
        self.l_tree = DecisionTreeRegressor(self.min_leaf_size, child_depth)
        self.r_tree = DecisionTreeRegressor(self.min_leaf_size, child_depth)

        self.l_tree.fit(data.iloc[l_idxs], label[l_idxs])
        self.r_tree.fit(data.iloc[r_idxs], label[r_idxs])

    def find_split_point(self, data, label):
        """Scan every feature/threshold; return (split_idx, split_value, score).

        Score is the size-weighted variance of the two partitions; it stays
        ``inf`` when no split satisfies ``min_leaf_size``.  Running sums make
        the scan O(n) per feature after the O(n log n) sort.
        """
        sample_size = len(data)
        best_score = float('inf')
        best_split_idx = -1
        best_split_value = None
        for idx in range(len(data.columns)):
            x = data.values[:, idx]
            sorted_idxs = np.argsort(x)
            x = x[sorted_idxs]
            y = label[sorted_idxs]

            y_sum = y.sum()
            y_square_sum = (y ** 2).sum()
            l_sample_size = 0
            l_y_sum = 0.0
            l_y_square_sum = 0.0
            r_sample_size = sample_size
            r_y_sum = y_sum
            r_y_square_sum = y_square_sum

            for i in range(sample_size - self.min_leaf_size):
                # Move row i from the right partition into the left one.
                r_sample_size -= 1
                r_y_sum -= y[i]
                r_y_square_sum -= (y[i] ** 2)
                l_sample_size += 1
                l_y_sum += y[i]
                l_y_square_sum += (y[i] ** 2)

                # Skip partitions that are too small, and duplicate feature
                # values (cannot split between two equal thresholds).
                if i < self.min_leaf_size or x[i] == x[i+1]:
                    continue

                # Variance via E[y^2] - E[y]^2 on each side.
                l_impurity = (l_y_square_sum / l_sample_size) - (l_y_sum / l_sample_size) ** 2
                r_impurity = (r_y_square_sum / r_sample_size) - (r_y_sum / r_sample_size) ** 2
                score = (l_impurity * l_sample_size + r_impurity * r_sample_size) / sample_size

                if score < best_score:
                    best_score = score
                    best_split_value = x[i]
                    best_split_idx = idx
        return best_split_idx, best_split_value, best_score

    def predict(self, x):
        """Predict for a DataFrame or 2-D array; returns a 1-D ndarray."""
        if type(x) == pd.DataFrame:
            x = x.values
        return np.array([self.predict_row(row) for row in x])

    def predict_row(self, row):
        """Route a single row down the tree and return the leaf's mean."""
        if self.l_tree is None:  # leaf: no children were grown
            return self.value
        if row[self.split_idx] <= self.split_value:
            return self.l_tree.predict_row(row)
        else:
            return self.r_tree.predict_row(row)

In [9]:
# train_data = np.array([]).reshape([0, 13])
# train_label = np.array([])
# test_data = []

# for i in range(1, 6):
#     train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % i, header=None).values))
#     train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)))
# for i in range(1, 7):
#     test_data.append(np.array(pd.read_csv('../input/test%d.csv' % i, header=None).values))
    
# print(train_data.shape)
# print(test_data[0].shape * 6)

In [10]:
# Grow the forest chunk-by-chunk: every fit() call appends 16 more trees.
model = RandomForestRegressor(n_trees=16, sample_size=100000, min_leaf_size=300, max_depth=25)
for chunk_no in range(1, 6):
    chunk_features = pd.read_csv('../input/train%d.csv' % chunk_no, header=None).values
    chunk_targets = pd.read_csv('../input/label%d.csv' % chunk_no, header=None).values.squeeze(-1)
    model.fit(chunk_features, chunk_targets)
    # Release the chunk before loading the next one.
    chunk_features = None
    chunk_targets = None
    gc.collect()
# for i in range(1, 6):
#     for j in range(1, 6):
#         if i == j:
#             continue
#         train_data = pd.read_csv('../input/train%d.csv' % i, header=None).values
#         train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % j, header=None).values))
#         train_label = pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)
#         train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % j, header=None).values.squeeze(-1)))
#         model.fit(train_data, train_label)
#         train_data = None
#         train_label = None
#         gc.collect()


fit with 4 workers
fit with 4 workers
fit with 4 workers
fit with 4 workers
fit with 4 workers

In [11]:
# Predict each test chunk, then concatenate once at the end.
# (The original re-concatenated the growing array on every iteration,
# which is quadratic in the total number of rows.)
preds = []
for i in range(1, 7):
    # .values is already an ndarray; the original's extra np.array() copy is dropped.
    test_data = pd.read_csv('../input/test%d.csv' % i, header=None).values
    preds.append(model.predict(test_data))
    test_data = None
    gc.collect()
arr = np.concatenate(preds)
print(arr.shape)
print(arr)
df = pd.DataFrame({ 'id': range(1, arr.size + 1), 'Predicted': arr })
df.to_csv('./rfr.csv', index=False)


predict with 4 workers
predict with 4 workers
predict with 4 workers
predict with 4 workers
predict with 4 workers
predict with 4 workers
(10915121,)
[-0.15689916 -0.1507478  -0.20327251 ...  0.02434633 -0.03409441
  0.145811  ]