In [1]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier  # used in the xgboost cells below
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
In [2]:
# get some data
digits = load_digits()
X, y = digits.data, digits.target
# build a classifier
clf = RandomForestClassifier(n_estimators=20)
# the report() utility used below is defined in the In [3] cell; run that cell first
# specify parameters and distributions to sample from
# (min_samples_split must be >= 2 in current scikit-learn)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
% (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
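Beyond the report() summary, both fitted search objects expose the winning configuration directly through sklearn's best_score_ and best_params_ attributes:

In [ ]:
# inspect the best configuration found by each search
print("random search best score: %.3f" % random_search.best_score_)
print(random_search.best_params_)
print("grid search best score: %.3f" % grid_search.best_score_)
print(grid_search.best_params_)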
In [3]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
In [16]:
param_dist = {
    'randomForest': {"max_depth": [3, None],
                     "max_features": sp_randint(1, 11),
                     "min_samples_split": sp_randint(2, 11),
                     "min_samples_leaf": sp_randint(1, 11),
                     "bootstrap": [True, False],
                     "criterion": ["gini", "entropy"]},
    # gradient boosting accepts no bootstrap or gini/entropy options,
    # so only the shared tree-shape parameters are kept here
    'gradientBoosting': {"max_depth": [3, None],
                         "max_features": sp_randint(1, 11),
                         "min_samples_split": sp_randint(2, 11),
                         "min_samples_leaf": sp_randint(1, 11)}
}
In [17]:
param_dist
Out[17]:
In [19]:
for key, val in param_dist.items():
    print(key)
    print(val)
    # these distributions contain sp_randint objects, so a randomized search
    # is needed rather than GridSearchCV
    random_search = RandomizedSearchCV(clf, param_distributions=val,
                                       n_iter=n_iter_search)
    random_search.fit(X, y)
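The dictionary keys suggest one search per model type was intended. A minimal sketch of that pattern, mapping each key to an estimator (the GradientBoostingClassifier choice is an assumption made to match the classification target):

In [ ]:
from sklearn.ensemble import GradientBoostingClassifier

# hypothetical mapping from the keys above to concrete estimators
models = {'randomForest': RandomForestClassifier(n_estimators=20),
          'gradientBoosting': GradientBoostingClassifier()}
for key, dist in param_dist.items():
    search = RandomizedSearchCV(models[key], param_distributions=dist,
                                n_iter=n_iter_search)
    search.fit(X, y)
    print(key, search.best_score_, search.best_params_)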
In [ ]:
# test_size and seed are assumed values; they were not defined in the original cell
test_size = 0.2
seed = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
# digits is a multiclass problem, so mlogloss replaces rmse as the early-stopping
# metric; recent xgboost takes early-stopping settings in the constructor
model = XGBClassifier(n_jobs=-1, early_stopping_rounds=100, eval_metric="mlogloss")
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
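accuracy_score was imported at the top but never used; a one-line check of the held-out predictions:

In [ ]:
# evaluate the held-out predictions
print("accuracy: %.3f" % accuracy_score(y_test, y_pred))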
In [ ]:
from sklearn.preprocessing import LabelEncoder

model = XGBClassifier(n_jobs=-1)
label_encoded_y = LabelEncoder().fit_transform(y)  # was undefined; derived from y
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# 'rmse' is not a valid sklearn scoring string; accuracy fits this classifier
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='accuracy', n_jobs=-1)
print(results.mean())
In [23]:
X_train = pd.read_csv("../train_x.csv")
y_train = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")
In [39]:
num_folds = 10
num_instances = len(X_train)
seed = 10
In [48]:
model = RandomForestRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# 'mean_squared_error' was renamed; sklearn scorers are negated losses
results = cross_val_score(model, X_train, y_train.values.ravel(), cv=kfold,
                          scoring='neg_mean_squared_error', n_jobs=-1)
print(results.mean())
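Since sklearn reports the negated MSE, a square root after a sign flip recovers the RMSE that regression leaderboards usually quote; a small follow-up using the scores just computed:

In [ ]:
# neg_mean_squared_error returns negated MSE, so flip the sign before the sqrt
rmse_scores = np.sqrt(-results)
print("RMSE: %.4f (+/- %.4f)" % (rmse_scores.mean(), rmse_scores.std()))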
In [ ]:
clf = RandomForestRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, 10, 15, 20],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["squared_error"]}  # formerly "mse"
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, cv=8)
start = time()
grid_search.fit(X_train.values, y_train.values.ravel())
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
% (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
grid_search.best_score_
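Rather than retyping the winning settings by hand (as the In [6] cell below does), the best configuration can be pulled straight off the fitted search object; a short sketch:

In [ ]:
# rebuild the forest from the search's best configuration
best_params = grid_search.best_params_
clf = RandomForestRegressor(n_estimators=500, **best_params)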
In [57]:
np.array(y_train)
Out[57]:
In [62]:
np.concatenate(y_train.values)
Out[62]:
In [6]:
param = {'criterion': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 1, 'bootstrap': False, 'min_samples_split': 2}
In [10]:
# max_features='auto' was removed from scikit-learn; 1.0 reproduces the old behaviour
clf = RandomForestRegressor(n_estimators=500, criterion='squared_error', max_depth=10,
                            max_features=1.0, min_samples_leaf=1, bootstrap=False,
                            min_samples_split=2)
In [3]:
X = pd.read_csv("../train_x2.csv")
y = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")
# note: this split overwrites the X_test frame loaded from CSV just above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [12]:
num_folds = 10
num_instances = len(X_train)
seed = 10
# run grid search
# grid_search = GridSearchCV(clf, param_grid=param, n_jobs = -1, cv= 8)
start = time()
clf.fit(X_train.values, y_train.values.ravel())
Out[12]:
In [13]:
result = clf.predict(X_test)
In [16]:
sample = pd.read_csv("../rawData/sample_submission.csv")
In [22]:
np.exp(result)
Out[22]:
In [20]:
# exp() assumes the SalePrice target was log-transformed during preprocessing
sample['SalePrice'] = np.exp(result)
In [24]:
sample.to_csv("sixth_submission.csv")
In [12]:
# clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=0).fit(X_train, y_train.values.ravel())
clf = RandomForestRegressor(bootstrap=False, criterion='squared_error', max_depth=10,
                            max_leaf_nodes=None, max_features=2,
                            min_samples_leaf=10, min_weight_fraction_leaf=0.0,
                            n_estimators=3000, n_jobs=-1, oob_score=False,
                            random_state=None, verbose=0, warm_start=False)
In [13]:
clf.fit(X_train.values, y_train.values.ravel())
Out[13]:
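With the forest fitted, its feature_importances_ attribute shows which columns drive the predictions; a short sketch using the training frame's column names:

In [ ]:
# rank features by the forest's impurity-based importances
importances = pd.Series(clf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))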
In [14]:
est = clf.predict(X_test)
In [15]:
# ravel avoids (n, n) broadcasting between the (n,) predictions and the (n, 1) frame
np.sqrt(np.mean(np.square(est - y_test.values.ravel())))
Out[15]:
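The same number can be cross-checked with sklearn's metric, which also guards against the shape mismatch fixed above:

In [ ]:
from sklearn.metrics import mean_squared_error
# cross-check the manual RMSE against sklearn's implementation
print(np.sqrt(mean_squared_error(y_test, est)))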
In [ ]: