In [1]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os

# Show which data files are available to work with.
available_files = os.listdir("../input")
print(available_files)
# Any results you write to the current directory are saved as output.
In [2]:
import gc
from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd
In [3]:
# from sklearn import metrics
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.preprocessing import StandardScaler
In [4]:
# train_data = np.array([]).reshape([0, 13])
# train_label = np.array([])
# test_data = np.array([]).reshape([0, 13])
# for i in range(1, 3):
# train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % i, header=None).values))
# train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)))
# for i in range(1, 7):
# test_data = np.concatenate((test_data, pd.read_csv('../input/test%d.csv' % i, header=None).values))
# print(train_data.shape)
# print(test_data.shape)
In [5]:
# model = RandomForestRegressor(n_estimators=20,
# n_jobs=-1,
# max_features=6,
# min_samples_split=10000,
# min_samples_leaf=10,
# max_depth=10)
# model.fit(train_data, train_label)
In [6]:
# arr = model.predict(test_data)
# df = pd.DataFrame({ 'id': range(1, arr.size + 1), 'Predicted': arr })
# df.to_csv('./rfr.csv', index=False)
In [7]:
class RandomForestRegressor(object):
    """Random forest regressor built from DecisionTreeRegressor base learners.

    Trees are fitted and evaluated in parallel with multiprocessing.Pool.
    Repeated calls to ``fit`` keep previously fitted trees and append new
    ones, so the forest can be grown incrementally over data chunks.
    """

    def __init__(self, n_trees, sample_size, min_leaf_size=5, n_jobs=-1, max_depth=None):
        # n_trees: number of trees added per call to fit().
        # sample_size: bootstrap sample size drawn (with replacement) per tree.
        # n_jobs: worker processes; -1 means use cpu_count().
        # max_depth: per-tree depth limit; None means unlimited.
        self.n_trees = n_trees
        self.sample_size = sample_size
        self.min_leaf_size = min_leaf_size
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.trees = []

    def get_sample_items(self):
        """Draw a bootstrap sample from the stored training data.

        np.random.choice defaults to replace=True, i.e. sampling with
        replacement, which is the standard bagging behaviour.
        """
        idxs = np.random.choice(len(self.data), self.sample_size)
        return self.data.iloc[idxs], self.label[idxs]

    def create_trees(self):
        """Instantiate the (unfitted) trees for one call to fit()."""
        return [DecisionTreeRegressor(self.min_leaf_size, self.max_depth)
                for _ in range(self.n_trees)]

    def _resolve_workers(self, n_jobs):
        """Translate an n_jobs value into a concrete worker count."""
        if n_jobs != -1:
            return n_jobs
        try:
            return cpu_count()
        except NotImplementedError:
            return 1

    def _map_with_workers(self, fn, items, n_jobs):
        """Map fn over items with a worker pool, closing the pool afterwards.

        The original code created a Pool per call and never closed it,
        leaking worker processes; the context manager fixes that.
        """
        workers = self._resolve_workers(n_jobs)
        with Pool(processes=workers) as pool:
            return list(pool.map(fn, items))

    def fit(self, x, y, n_jobs=None):
        """Fit n_trees new trees on (x, y) and append them to the forest."""
        if type(x) != pd.DataFrame:
            x = pd.DataFrame(x)
        self.data = x
        self.label = y
        if n_jobs is None:
            n_jobs = self.n_jobs
        self.fit_with_workers(self.fit_tree, n_jobs)
        # Drop references so the (large) training data can be collected.
        self.data = None
        self.label = None

    def fit_with_workers(self, fit_fn, n_jobs=-1):
        """Fit freshly created trees in parallel and append them to self.trees."""
        workers = self._resolve_workers(n_jobs)
        print('fit with %d workers' % workers)
        self.trees += self._map_with_workers(fit_fn, self.create_trees(), n_jobs)

    def fit_tree(self, tree):
        """Fit a single tree on a bootstrap sample; used as the pool worker."""
        data, label = self.get_sample_items()
        return tree.fit(data, label)

    def predict(self, x, n_jobs=None):
        """Predict for x by averaging the predictions of all fitted trees."""
        if n_jobs is None:
            n_jobs = self.n_jobs
        self.predict_data = x
        predicted = np.stack(self.predict_with_workers(self.predict_tree, n_jobs))
        all_predicted = np.mean(predicted, axis=0)
        self.predict_data = None
        return all_predicted

    def predict_with_workers(self, predict_fn, n_jobs=-1):
        """Evaluate every fitted tree on self.predict_data in parallel."""
        workers = self._resolve_workers(n_jobs)
        print('predict with %d workers' % workers)
        return self._map_with_workers(predict_fn, self.trees, n_jobs)

    def predict_tree(self, tree):
        """Predict with one tree; used as the pool worker."""
        return tree.predict(self.predict_data)
In [8]:
class DecisionTreeRegressor(object):
    """CART-style regression tree trained with variance-reduction splits.

    Every node stores the mean label (``value``); internal nodes
    additionally store a feature index and threshold and delegate
    prediction to left/right subtrees.
    """

    def __init__(self, min_leaf_size, max_depth=None):
        # min_leaf_size: minimum number of samples allowed in a child node.
        # max_depth: remaining allowed depth; None means unlimited.
        self.min_leaf_size = min_leaf_size
        self.max_depth = max_depth
        self.split_idx = -1      # feature column used for the split
        self.split_value = None  # threshold: left subtree gets x <= split_value
        self.value = None        # mean of the labels reaching this node
        self.l_tree = None
        self.r_tree = None

    def fit(self, x, y):
        """Fit this node (and recursively its children) on x, y; returns self."""
        self.value = np.mean(y)
        if self.max_depth is not None and self.max_depth <= 1:
            return self
        self.split(x, y)
        return self

    def split(self, data, label):
        """Find the best split; if one exists, create and fit child nodes."""
        split_idx, split_value, score = self.find_split_point(data, label)
        if score == float('inf'):
            # No split satisfies the min_leaf_size constraint: stay a leaf.
            return
        self.split_idx = split_idx
        self.split_value = split_value
        l_idxs = np.nonzero(data.iloc[:, split_idx] <= split_value)[0]
        r_idxs = np.nonzero(data.iloc[:, split_idx] > split_value)[0]
        # Bug fix: the original computed ``self.max_depth - 1`` unconditionally,
        # which raised TypeError when max_depth is None (the documented default
        # meaning "unlimited depth").
        child_depth = None if self.max_depth is None else self.max_depth - 1
        self.l_tree = DecisionTreeRegressor(self.min_leaf_size, child_depth)
        self.r_tree = DecisionTreeRegressor(self.min_leaf_size, child_depth)
        self.l_tree.fit(data.iloc[l_idxs], label[l_idxs])
        self.r_tree.fit(data.iloc[r_idxs], label[r_idxs])

    def find_split_point(self, data, label):
        """Scan every feature/threshold and return (idx, value, score).

        Score is the size-weighted variance of the two partitions; running
        sums make each feature a single O(n) pass after sorting.  Returns
        score == inf when no admissible split exists.
        """
        sample_size = len(data)
        best_score = float('inf')
        best_split_idx = -1
        best_split_value = None
        for idx in range(len(data.columns)):
            x = data.values[:, idx]
            sorted_idxs = np.argsort(x)
            x = x[sorted_idxs]
            y = label[sorted_idxs]
            y_sum = y.sum()
            y_square_sum = (y ** 2).sum()
            l_sample_size = 0
            l_y_sum = 0.0
            l_y_square_sum = 0.0
            r_sample_size = sample_size
            r_y_sum = y_sum
            r_y_square_sum = y_square_sum
            for i in range(sample_size - self.min_leaf_size):
                # Move sample i from the right partition into the left one.
                r_sample_size -= 1
                r_y_sum -= y[i]
                r_y_square_sum -= (y[i] ** 2)
                l_sample_size += 1
                l_y_sum += y[i]
                l_y_square_sum += (y[i] ** 2)
                # Skip while the left side is too small, and never split
                # between equal feature values.
                if i < self.min_leaf_size or x[i] == x[i+1]:
                    continue
                # Variance via E[y^2] - E[y]^2 on each side.
                l_impurity = (l_y_square_sum / l_sample_size) - (l_y_sum / l_sample_size) ** 2
                r_impurity = (r_y_square_sum / r_sample_size) - (r_y_sum / r_sample_size) ** 2
                score = (l_impurity * l_sample_size + r_impurity * r_sample_size) / sample_size
                if score < best_score:
                    best_score = score
                    best_split_value = x[i]
                    best_split_idx = idx
        return best_split_idx, best_split_value, best_score

    def predict(self, x):
        """Predict a value for every row of x (DataFrame or 2-D array)."""
        if type(x) == pd.DataFrame:
            x = x.values
        return np.array([self.predict_row(row) for row in x])

    def predict_row(self, row):
        """Route one sample down the tree and return the leaf mean."""
        if self.l_tree is None:
            return self.value
        if row[self.split_idx] <= self.split_value:
            return self.l_tree.predict_row(row)
        else:
            return self.r_tree.predict_row(row)
In [9]:
# train_data = np.array([]).reshape([0, 13])
# train_label = np.array([])
# test_data = []
# for i in range(1, 6):
# train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % i, header=None).values))
# train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)))
# for i in range(1, 7):
# test_data.append(np.array(pd.read_csv('../input/test%d.csv' % i, header=None).values))
# print(train_data.shape)
# print(test_data[0].shape * 6)
In [10]:
# Grow the forest incrementally: each training chunk contributes 16 more trees.
model = RandomForestRegressor(n_trees=16, sample_size=100000, min_leaf_size=300, max_depth=25)
for chunk_idx in range(1, 6):
    chunk_features = pd.read_csv('../input/train%d.csv' % chunk_idx, header=None).values
    chunk_labels = pd.read_csv('../input/label%d.csv' % chunk_idx, header=None).values.squeeze(-1)
    model.fit(chunk_features, chunk_labels)
    # Drop the chunk and collect so the next read does not double peak memory.
    chunk_features = None
    chunk_labels = None
    gc.collect()
# for i in range(1, 6):
# for j in range(1, 6):
# if i == j:
# continue
# train_data = pd.read_csv('../input/train%d.csv' % i, header=None).values
# train_data = np.concatenate((train_data, pd.read_csv('../input/train%d.csv' % j, header=None).values))
# train_label = pd.read_csv('../input/label%d.csv' % i, header=None).values.squeeze(-1)
# train_label = np.concatenate((train_label, pd.read_csv('../input/label%d.csv' % j, header=None).values.squeeze(-1)))
# model.fit(train_data, train_label)
# train_data = None
# train_label = None
# gc.collect()
In [11]:
# Run the fitted forest over the six test chunks, concatenating the
# predictions into one flat vector, then write the submission file.
arr = np.array([])
for chunk_no in range(1, 7):
    test_data = np.array(pd.read_csv('../input/test%d.csv' % chunk_no, header=None).values)
    arr = np.concatenate((arr, model.predict(test_data)))
    # Free the chunk before loading the next one to cap peak memory.
    test_data = None
    gc.collect()
print(arr.shape)
print(arr)
df = pd.DataFrame({'id': range(1, arr.size + 1), 'Predicted': arr})
df.to_csv('./rfr.csv', index=False)