notebook.community

Edit and run



In [27]:

    
import numpy as np

from time import time
from scipy.stats import randint as sp_randint
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits

from sklearn.model_selection import cross_val_score
# from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor



In [2]:

    
# get some data
digits = load_digits()
X, y = digits.data, digits.target

# build a classifier
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# use a full grid over all parameters
pra

param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)









    



RandomizedSearchCV took 5.49 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.920 (std: 0.018)
Parameters: {'min_samples_leaf': 4, 'criterion': 'gini', 'min_samples_split': 7, 'bootstrap': False, 'max_depth': None, 'max_features': 6}

Model with rank: 2
Mean validation score: 0.919 (std: 0.021)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.918 (std: 0.011)
Parameters: {'min_samples_leaf': 3, 'criterion': 'gini', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 5}

GridSearchCV took 55.57 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.932 (std: 0.016)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 2
Mean validation score: 0.929 (std: 0.014)
Parameters: {'min_samples_leaf': 1, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': True, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.928 (std: 0.014)
Parameters: {'min_samples_leaf': 1, 'criterion': 'gini', 'min_samples_split': 3, 'bootstrap': False, 'max_depth': None, 'max_features': 10}

Model with rank: 3
Mean validation score: 0.928 (std: 0.007)
Parameters: {'min_samples_leaf': 3, 'criterion': 'entropy', 'min_samples_split': 1, 'bootstrap': False, 'max_depth': None, 'max_features': 3}



In [3]:

    
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")



In [16]:

    
param_dist = {
    'randomForest': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]},
    'gradientBoosting': {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}    
}



In [17]:

    
param_dist









    Out[17]:





{'gradientBoosting': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5359e8>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x107689a58>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535c88>},
 'randomForest': {'bootstrap': [True, False],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, None],
  'max_features': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5354e0>,
  'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f535860>,
  'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen at 0x10f5355f8>}}



In [19]:

    
for key, val in param_dist.items():
    print(key)
    print(val)
    
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    grid_search.fit(X, y)









    



gradientBoosting
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x107689a58>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535c88>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5359e8>}
randomForest
{'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f535860>, 'criterion': ['gini', 'entropy'], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5355f8>, 'bootstrap': [True, False], 'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f5354e0>}



In [ ]:

    
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
model = XGBClassifier(nthread = -1)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="rmse", eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)



In [ ]:



In [ ]:

    
model = XGBClassifier(nthread=-1)
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='rmse', n_jobs=-1)
print(results.mean())



In [23]:

    
X_train = pd.read_csv("../train_x.csv")
y_train = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")



In [39]:

    
num_folds = 10
num_instances = len(X_train)
seed = 10



In [ ]:



In [48]:

    
model = RandomForestRegressor()
kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='mean_squared_error', n_jobs=-1)
print(results.mean())









    



/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/model_selection/_validation.py:238: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/Users/daniel/anaconda3/envs/tensorflow2/lib/python3.5/site-packages/sklearn/metrics/scorer.py:90: DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)






    



-0.000201395318266



In [ ]:

    
clf = RandomForestRegressor(n_estimators=500)
param_grid = {"max_depth": [3, 5, 10, 15, 20],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["mse"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs = -1, cv= 8)
start = time()
grid_search.fit(X_train.values, np.concatenate(y_train.values))



print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
grid_search.best_score_



In [57]:

    
np.array(y_train)









    Out[57]:





array([[ 12.24769432],
       [ 12.10901093],
       [ 12.31716669],
       ..., 
       [ 12.49312952],
       [ 11.86446223],
       [ 11.90158345]])



In [62]:

    
np.concatenate(y_train.values)









    Out[62]:





array([ 12.24769432,  12.10901093,  12.31716669, ...,  12.49312952,
        11.86446223,  11.90158345])



In [6]:

    
param = {'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 1, 'bootstrap': False, 'min_samples_split': 1}



In [10]:

    
clf = RandomForestRegressor(n_estimators=500, criterion = 'mse', max_depth = 10, max_features = 'auto', min_samples_leaf = 1, bootstrap = False,  min_samples_split =1)



In [30]:

    
X = pd.read_csv("../train_x.csv")
y = pd.read_csv("../train_y.csv")
X_test = pd.read_csv("../test_x.csv")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [12]:

    
num_folds = 10
num_instances = len(X_train)
seed = 10

# run grid search
# grid_search = GridSearchCV(clf, param_grid=param, n_jobs = -1, cv= 8)
start = time()
clf.fit(X_train.values, np.concatenate(y_train.values))









    Out[12]:





RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=1, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)



In [13]:

    
result = clf.predict(X_test)



In [15]:









    Out[15]:





array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])



In [16]:

    
sample = pd.read_csv("../rawData/sample_submission.csv")



In [22]:

    
np.exp(result)









    Out[22]:





array([ 36352.2358799 ,  36354.37073754,  37413.98292733, ...,
        35884.69144974,  37331.47252103,  36954.64964979])



In [20]:

    
sample['SalePrice'] = np.exp(result)



In [24]:

    
sample.to_csv("sixth_submission.csv")



In [36]:

    
np.concatenate(y_train.values)









    Out[36]:





array([ 12.03765399,  12.66191396,  11.88793137, ...,  12.49312952,
        11.87756858,  12.3327053 ])



In [101]:

    
# clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=0).fit(X_train, np.concatenate(y_train.values))
clf = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=1,
            max_leaf_nodes=None, max_features = 2,
           min_impurity_split=1e-07, min_samples_leaf=10,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=3000, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)



In [102]:

    
clf.fit(X_train.values, np.concatenate(y_train.values))









    Out[102]:





RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=1,
           max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)



In [103]:

    
est = clf.predict(X_test)



In [104]:

    
np.sqrt(np.mean(np.square((est - y_test.values))))









    Out[104]:





0.38328915730167745



In [55]:









    Out[55]:





array([ 12.23069555,  12.55672952,  12.5538817 ,  12.32374455,
        11.69107165,  11.36210258,  11.87057003,  11.81303006,
        11.88103479,  12.2713765 ,  11.74006104,  11.77528973,
        11.76745137,  12.06968002,  11.79810441,  11.7905572 ,
        11.6784399 ,  12.04941884,  12.07539432,  12.08953883,
        11.97019203,  11.81303006,  12.32829028,  12.72188581,
        12.5776362 ,  11.84934899,  12.30817787,  12.20667496,
        12.37158708,  12.46843691,  12.6698706 ,  11.77143616,
        12.14685329,  12.38833467,  12.33929149,  11.49272276,
        12.45097769,  11.8277362 ,  11.33260191,  12.25486281,
        11.83132724,  12.27834162,  11.30220443,  11.9249247 ,
        11.6351431 ,  12.14950229,  12.10052689,  11.82020422,
        12.33691811,  12.14153412,  12.77705219,  12.32374455,
        12.10901093,  12.04348899,  12.02718519,  12.13242607,
        12.23069555,  11.90158345,  12.03152079,  11.9639392 ,
        12.29778545,  11.68671249,  12.09737323,  12.06104687,
        11.81672692,  12.26434155,  11.94452229,  11.73593563,
        12.12806195,  10.5789798 ,  12.46843691,  11.84653647,
        12.08953883,  12.47772688,  11.89818787,  11.46163217,
        12.30591798,  12.26893005,  11.79433792,  11.84222921,
        12.07812561,  11.91805713,  11.86375832,  12.04348899,
        12.52088339,  11.96718074,  12.37158708,  11.8968264 ,
        11.56076279,  11.38509209,  12.04941884,  12.21106019,
        11.80335376,  11.83482774,  12.21930965,  11.72803684,
        11.9639392 ,  12.16002871,  12.15476055,  12.28740215,
        11.84510278,  12.10052689,  11.87766449,  12.36740886,
        11.27720313,  11.81204185,  10.46024211,  11.76745137,
        11.42190607,  12.66602574,  11.01862914,  12.67601377,
        11.91805713,  11.87057003,  11.97019203,  12.56024446,
        12.23069555,  11.90496755,  12.07812561,  11.86727266,
        11.72480582,  11.69524702,  11.91805713,  12.67601377,
        11.79433792,  11.60806511,  11.91170158,  12.32374455,
        12.33138284,  12.20597262,  12.18075484,  12.01974307,
        12.3413681 ,  12.09866836,  12.14153412,  12.20877561,
        12.26434155,  12.15476055,  10.858999  ,  12.65034813,
        12.01612206,  12.36307639,  12.52434221,  12.14419724,
        11.99535161,  11.69107165,  11.82407989,  12.10348606,
        12.06104687,  12.33691811,  11.66992921,  11.35040654,
        13.00865926,  12.20597262,  11.62625415,  11.28978191,
        11.92768062,  12.59556553,  12.12806195,  12.20597262,
        12.02574909,  11.94157797,  12.49113785,  11.86727266,
        12.4049235 ,  12.48582713,  12.00748752,  12.4874851 ,
        11.90834024,  12.17545746,  11.88103479,  12.01066585,
        11.73593563,  12.34583459,  11.19134184,  11.78676213,
        12.20877561,  12.16002871,  11.40756495,  12.13899755,
        11.85651517,  11.99535161,  12.01372274,  12.14950229,
        12.00130092,  12.10052689,  12.7512997 ,  12.47418956,
        12.31492705,  12.05233855,  11.84110263,  12.47772688,
        11.66992921,  11.69524702,  12.52434221,  12.82362845,
        12.17545746,  11.86375832,  11.73206099,  12.56024446,
        11.76134682,  10.97678203,  12.52434221,  11.89818787,
        11.98285988,  11.82020422,  12.36740886,  12.46651198,
        12.27834162,  11.82020422,  12.39462475,  12.31268238,
        11.92337811,  12.04941884,  11.66145321,  12.63117696,
        11.77528973,  12.85055465,  12.05805587,  11.95113737,
        12.28995413,  12.06104687,  12.59133505,  11.95113737,
        11.67419361,  11.82407989,  11.87057003,  11.68266824,
        12.31940133,  11.95761129,  12.72188581,  11.73593563,
        11.23188794,  12.38698335,  12.08672589,  12.30138283,
        12.12675884,  11.84222921,  12.06104687,  12.70350903,
        11.57355009,  11.69940503,  11.87057003,  12.53537639,
        11.9138804 ,  12.96086657,  12.37581542,  12.35233515,
        11.69524702,  11.77143616,  12.51343477,  12.19551713,
        11.71177632,  12.35449265,  11.54248427,  11.81303006,
        11.2835123 ,  12.06393288,  12.34583459,  12.25247902,
        12.41543365,  11.19821472,  11.28853113,  11.28853113,
        11.75194237,  11.84222921,  11.9639392 ,  12.19095901,
        12.09500138,  11.95113737,  11.75194237,  11.99535161,
        11.74799759,  12.17818744,  11.91805713,  11.8091001 ,
        11.71177632,  12.31043266,  11.37939407,  11.88444303,
        11.74403719,  12.76568843,  11.60806511,  12.69158046,
        11.85651517,  11.9639392 ,  12.53177279,  11.7905572 ,
        11.95113737,  12.01372274,  12.04941884,  12.10634482])



In [ ]: