notebook.community

Edit and run



In [1]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Switch matplotlib to its 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')



In [19]:

    
# Load the Boston dataset through scikit-learn's loader; later cells index the
# returned object by key ('data', 'target', 'feature_names').
# NOTE(review): load_boston is reported as deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned scikit-learn version before relying on
# Restart & Run All.
from sklearn.datasets import load_boston
boston = load_boston()



In [21]:

    
# Show which keys the loaded dataset object exposes.
boston.keys()









    Out[21]:





dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])



In [22]:

    
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# feature names supplied with the dataset.
feature_df = pd.DataFrame(
    boston['data'],
    columns=boston['feature_names'],
)



In [23]:

    
# Keep the regression target as a pandas Series named 'target'.
target = pd.Series(
    boston['target'],
    name='target',
)



In [24]:

    
# Histogram of the target values. Series.hist() returns the Axes it drew on,
# so we keep it under a real name (instead of rebinding `_`, which IPython
# uses for the last output) and label the figure so it can be read on its own.
ax = target.hist()
ax.set_title('Distribution of target')
ax.set_xlabel('target')
ax.set_ylabel('count')
# plt.show() renders the figure without echoing a matplotlib object repr.
plt.show()



In [25]:

    
# Preview the first rows of the feature table.
feature_df.head()



In [33]:

    
from tpot import TPOTRegressor

# Configure the TPOT search: 5 generations with a population of 50, run on
# 4 parallel jobs, seeded with 42, using the 'Regressor' pipeline template
# and progress output at verbosity level 2.
regressor = TPOTRegressor(
    template='Regressor',
    generations=5,
    population_size=50,
    random_state=42,
    n_jobs=4,
    verbosity=2,
)

# Search on the full feature matrix / target arrays. fit() is deliberately the
# last expression of the cell so the fitted estimator's repr is displayed.
regressor.fit(feature_df.values, target.values)









    





 
 










    



Generation 1 - Current best internal CV score: -20.164673087731806
Generation 2 - Current best internal CV score: -20.164673087731806
Generation 3 - Current best internal CV score: -18.810523582154456
Generation 4 - Current best internal CV score: -18.810523582154456
Generation 5 - Current best internal CV score: -18.810523582154456

Best pipeline: GradientBoostingRegressor(input_matrix, alpha=0.9, learning_rate=0.1, loss=lad, max_depth=7, max_features=0.3, min_samples_leaf=10, min_samples_split=3, n_estimators=100, subsample=0.7500000000000001)






    Out[33]:





TPOTRegressor(config_dict=None, crossover_rate=0.1, cv=5,
              disable_update_check=False, early_stop=None, generations=5,
              max_eval_time_mins=5, max_time_mins=None, memory=None,
              mutation_rate=0.9, n_jobs=4, offspring_size=None,
              periodic_checkpoint_folder=None, population_size=50,
              random_state=42, scoring=None, subsample=1.0,
              template='Regressor', use_dask=False, verbosity=2,
              warm_start=False)



In [36]:

    
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split



In [35]:

    
# Ask the fitted TPOT regressor for its best pipeline and print what it returns.
# NOTE(review): export() is called without an output path; the printed result
# suggests it returns the generated script as text -- confirm for the installed
# TPOT version.
m = regressor.export()
print(m)









    



import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -18.810523582154456
exported_pipeline = GradientBoostingRegressor(alpha=0.9, learning_rate=0.1, loss="lad", max_depth=7, max_features=0.3, min_samples_leaf=10, min_samples_split=3, n_estimators=100, subsample=0.7500000000000001)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)



In [37]:

    
# Rebuild the estimator TPOT selected, with the hyper-parameters from the
# exported script. The exported script also pins random_state to 42; the seed
# is set here as well so repeated runs of this cell give the same model
# (subsample and max_features below 1 make the fit depend on the seed).
exported_pipeline = GradientBoostingRegressor(
    alpha=0.9, learning_rate=0.1,
    loss="lad", max_depth=7, max_features=0.3, min_samples_leaf=10, min_samples_split=3, n_estimators=100, subsample=0.7500000000000001,
    random_state=42)

# Final model fitted on all rows, so `exported_pipeline` is a usable fitted
# estimator for any later cell.
exported_pipeline.fit(feature_df.values, target.values)

# Predicting the rows the model was trained on would make the
# predicted-vs-actual comparison look better than it is. Instead, collect
# out-of-fold predictions: each row is predicted by a clone of the estimator
# that did not see it during training. The result has one prediction per row,
# in the original row order, so it still lines up with `target.values`.
from sklearn.model_selection import cross_val_predict
results = cross_val_predict(exported_pipeline, feature_df.values, target.values, cv=5)



In [41]:

    
# Pair every prediction with its observed value, then scatter predicted
# against actual. The plot call stays last so the cell output is unchanged.
values = pd.DataFrame({
    'predicted': results,
    'actual': target.values,
})
values.plot(x='actual', y='predicted', kind='scatter')









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fe9656eaf50>

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33