In [1]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

from sklearn.model_selection import cross_val_score
# from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
pra

param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)


RandomizedSearchCV took 5.49 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.920 (std: 0.018)
Parameters: {'min_samples_leaf': 4, 'criterion': 'gini', 'min_samples_split': 7, 'bootstrap': False, 'max_depth': None, 'max_features': 6}

Model with rank: 2
Mean validation score: 0.919 (std: 0.021)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.918 (std: 0.011)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 5}

GridSearchCV took 55.57 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.932 (std: 0.016)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 2
Mean validation score: 0.929 (std: 0.014)
Parameters: {'min_samples_leaf': 1, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': True, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.928 (std: 0.014)
Parameters: {'min_samples_leaf': 1, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.928 (std: 0.007)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 3}


In [3]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [16]:
param_dist = {
    'randomForest': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]},
    'gradientBoosting': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}    
}

In [17]:
param_dist


Out[17]:
{'gradientBoosting': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5359e8>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x107689a58>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535c88>},
 'randomForest': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5354e0>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535860>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5355f8>}}

In [19]:
for key, val in param_dist.items():
    print(key)
    print(val)
    
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)


gradientBoosting
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107689a58>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535c88>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5359e8>}
randomForest
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535860>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5355f8>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5354e0>}

In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier(nthread = -1)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="rmse", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)

In [ ]:


In [ ]:
model = XGBClassifier(nthread=-1)
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='rmse', n_jobs=-1)
print(results.mean())

In [23]:
X_train = pd.read_csv("../train_x.csv")
y_train = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")

In [39]:
num_folds = 10
num_instances = len(X_train)
seed = 10

In [ ]:


In [48]:
model = RandomForestRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='mean_squared_error', n_jobs=-1)
print(results.mean())


/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
-0.000201395318266

In [ ]:
clf = RandomForestRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, 10, 15, 20],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)
start = time()
grid_search.fit(X_train.values, np.concatenate(y_train.values))



print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
grid_search.best_score_

In [57]:
np.array(y_train)


Out[57]:
array([[ 12.24769432],
       [ 12.10901093],
       [ 12.31716669],
       ..., 
       [ 12.49312952],
       [ 11.86446223],
       [ 11.90158345]])

In [62]:
np.concatenate(y_train.values)


Out[62]:
array([ 12.24769432,  12.10901093,  12.31716669, ...,  12.49312952,
        11.86446223,  11.90158345])

In [6]:
param = {'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 1, 'bootstrap': False, 'min_samples_split': 1}

In [10]:
clf = RandomForestRegressor(n_estimators=500, criterion = 'mse', max_depth = 10, max_features = 'auto', min_samples_leaf = 1, bootstrap = False,  min_samples_split =1)

In [3]:
X = pd.read_csv("../train_x2.csv")
y = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
num_folds = 10
num_instances = len(X_train)
seed = 10

# run grid search
# grid_search = GridSearchCV(clf, param_grid=param, n_jobs = -1, cv= 8)
start = time()
clf.fit(X_train.values, np.concatenate(y_train.values))


Out[12]:
RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [13]:
result = clf.predict(X_test)

In [15]:



Out[15]:
array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])

In [16]:
sample = pd.read_csv("../rawData/sample_submission.csv")

In [22]:
np.exp(result)


Out[22]:
array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])

In [20]:
sample['SalePrice'] = np.exp(result)

In [24]:
sample.to_csv("sixth_submission.csv")

In [36]:
np.concatenate(y_train.values)


Out[36]:
array([ 12.03765399,  12.66191396,  11.88793137, ...,  12.49312952,
        11.87756858,  12.3327053 ])

In [12]:
# clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=0).fit(X_train, np.concatenate(y_train.values))
clf = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
            max_leaf_nodes=None, max_features = 2,
min_samples_leaf=10,
 min_weight_fraction_leaf=0.0,
           n_estimators=3000, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [13]:
clf.fit(X_train.values, np.concatenate(y_train.values))


Out[13]:
RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
est = clf.predict(X_test)

In [15]:
np.sqrt(np.mean(np.square((est - y_test.values))))


Out[15]:
0.47418450600462808

In [55]:



Out[55]:
array([ 12.23069555,  12.55672952,  12.5538817 ,  12.32374455,
        11.69107165,  11.36210258,  11.87057003,  11.81303006,
        11.88103479,  12.2713765 ,  11.74006104,  11.77528973,
        11.76745137,  12.06968002,  11.79810441,  11.7905572 ,
        11.6784399 ,  12.04941884,  12.07539432,  12.08953883,
        11.97019203,  11.81303006,  12.32829028,  12.72188581,
        12.5776362 ,  11.84934899,  12.30817787,  12.20667496,
        12.37158708,  12.46843691,  12.6698706 ,  11.77143616,
        12.14685329,  12.38833467,  12.33929149,  11.49272276,
        12.45097769,  11.8277362 ,  11.33260191,  12.25486281,
        11.83132724,  12.27834162,  11.30220443,  11.9249247 ,
        11.6351431 ,  12.14950229,  12.10052689,  11.82020422,
        12.33691811,  12.14153412,  12.77705219,  12.32374455,
        12.10901093,  12.04348899,  12.02718519,  12.13242607,
        12.23069555,  11.90158345,  12.03152079,  11.9639392 ,
        12.29778545,  11.68671249,  12.09737323,  12.06104687,
        11.81672692,  12.26434155,  11.94452229,  11.73593563,
        12.12806195,  10.5789798 ,  12.46843691,  11.84653647,
        12.08953883,  12.47772688,  11.89818787,  11.46163217,
        12.30591798,  12.26893005,  11.79433792,  11.84222921,
        12.07812561,  11.91805713,  11.86375832,  12.04348899,
        12.52088339,  11.96718074,  12.37158708,  11.8968264 ,
        11.56076279,  11.38509209,  12.04941884,  12.21106019,
        11.80335376,  11.83482774,  12.21930965,  11.72803684,
        11.9639392 ,  12.16002871,  12.15476055,  12.28740215,
        11.84510278,  12.10052689,  11.87766449,  12.36740886,
        11.27720313,  11.81204185,  10.46024211,  11.76745137,
        11.42190607,  12.66602574,  11.01862914,  12.67601377,
        11.91805713,  11.87057003,  11.97019203,  12.56024446,
        12.23069555,  11.90496755,  12.07812561,  11.86727266,
        11.72480582,  11.69524702,  11.91805713,  12.67601377,
        11.79433792,  11.60806511,  11.91170158,  12.32374455,
        12.33138284,  12.20597262,  12.18075484,  12.01974307,
        12.3413681 ,  12.09866836,  12.14153412,  12.20877561,
        12.26434155,  12.15476055,  10.858999  ,  12.65034813,
        12.01612206,  12.36307639,  12.52434221,  12.14419724,
        11.99535161,  11.69107165,  11.82407989,  12.10348606,
        12.06104687,  12.33691811,  11.66992921,  11.35040654,
        13.00865926,  12.20597262,  11.62625415,  11.28978191,
        11.92768062,  12.59556553,  12.12806195,  12.20597262,
        12.02574909,  11.94157797,  12.49113785,  11.86727266,
        12.4049235 ,  12.48582713,  12.00748752,  12.4874851 ,
        11.90834024,  12.17545746,  11.88103479,  12.01066585,
        11.73593563,  12.34583459,  11.19134184,  11.78676213,
        12.20877561,  12.16002871,  11.40756495,  12.13899755,
        11.85651517,  11.99535161,  12.01372274,  12.14950229,
        12.00130092,  12.10052689,  12.7512997 ,  12.47418956,
        12.31492705,  12.05233855,  11.84110263,  12.47772688,
        11.66992921,  11.69524702,  12.52434221,  12.82362845,
        12.17545746,  11.86375832,  11.73206099,  12.56024446,
        11.76134682,  10.97678203,  12.52434221,  11.89818787,
        11.98285988,  11.82020422,  12.36740886,  12.46651198,
        12.27834162,  11.82020422,  12.39462475,  12.31268238,
        11.92337811,  12.04941884,  11.66145321,  12.63117696,
        11.77528973,  12.85055465,  12.05805587,  11.95113737,
        12.28995413,  12.06104687,  12.59133505,  11.95113737,
        11.67419361,  11.82407989,  11.87057003,  11.68266824,
        12.31940133,  11.95761129,  12.72188581,  11.73593563,
        11.23188794,  12.38698335,  12.08672589,  12.30138283,
        12.12675884,  11.84222921,  12.06104687,  12.70350903,
        11.57355009,  11.69940503,  11.87057003,  12.53537639,
        11.9138804 ,  12.96086657,  12.37581542,  12.35233515,
        11.69524702,  11.77143616,  12.51343477,  12.19551713,
        11.71177632,  12.35449265,  11.54248427,  11.81303006,
        11.2835123 ,  12.06393288,  12.34583459,  12.25247902,
        12.41543365,  11.19821472,  11.28853113,  11.28853113,
        11.75194237,  11.84222921,  11.9639392 ,  12.19095901,
        12.09500138,  11.95113737,  11.75194237,  11.99535161,
        11.74799759,  12.17818744,  11.91805713,  11.8091001 ,
        11.71177632,  12.31043266,  11.37939407,  11.88444303,
        11.74403719,  12.76568843,  11.60806511,  12.69158046,
        11.85651517,  11.9639392 ,  12.53177279,  11.7905572 ,
        11.95113737,  12.01372274,  12.04941884,  12.10634482])

In [ ]: