In [2]:
from IPython import display

# Embed the TPOT project page in the cell output for quick reference.
# NOTE(review): some sites refuse to be rendered inside an iframe — confirm
# this output is not blank when the notebook is re-run.
URL = "https://github.com/rhiever/tpot"
display.IFrame(src=URL, width=1000, height=1000)
Out[2]:
TPOT uses a genetic algorithm (implemented with the DEAP library) to search for an optimal machine-learning pipeline for a supervised learning task: classification (as in this notebook) or regression.
What is a pipeline?
A pipeline is composed of preprocessors:
TPOTBase is the key class.
parameters:
population_size: int (default: 100) The number of pipelines in the genetic algorithm population. Must be > 0. The more pipelines in the population, the slower TPOT will run, but it's also more likely to find better pipelines.
generations: int. The number of iterations to run the pipeline optimization process.
TPOTClassifier and TPOTRegressor inherit from the parent class TPOTBase, with modifications of the scoring function.
In [1]:
# Install TPOT together with the other packages listed below.
# NOTE(review): `sudo pip` installs with root privileges into whichever Python
# `pip` resolves to, which may not be this kernel's interpreter — confirm.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
!sudo pip install deap update_checker tqdm xgboost tpot
In [3]:
import pandas as pd
import numpy as np
import psycopg2
import os
import json
from tpot import TPOTClassifier
from sklearn.metrics import classification_report
In [4]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
conn = psycopg2.connect(
    user=os.environ['REDSHIFT_USER'],
    password=os.environ['REDSHIFT_PASS'],
    port=os.environ['REDSHIFT_PORT'],
    host=os.environ['REDSHIFT_HOST'],
    database='tradesy',
)

# Label column (purchase_dummy) followed by the model features.
# NOTE(review): `limit` without `order by` does not guarantee which rows are
# returned, so the 50000-row sample may differ between runs — confirm this is
# acceptable.
query = """
select
purchase_dummy
,shipping_price_ratio
,asking_price
,price_level
,brand_score
,brand_size
,a_over_b
,favorite_count
,has_blurb
,has_image
,seasonal_component
,description_length
,product_category_accessories
,product_category_shoes
,product_category_bags
,product_category_tops
,product_category_dresses
,product_category_weddings
,product_category_bottoms
,product_category_outerwear
,product_category_jeans
,product_category_activewear
,product_category_suiting
,product_category_swim
from saleability_model_v2
limit 50000
"""

# Load the result into a DataFrame and always release the database connection
# afterwards, even if the query fails, so it is not left open for the lifetime
# of the kernel.
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()
In [5]:
# Name of the label column; every other column is used as a feature.
target = 'purchase_dummy'
# Build the feature list eagerly: on Python 3 `filter` returns a lazy iterator,
# which cannot be used to select DataFrame columns.
domain = [column for column in df.columns.values if column != target]
df = df.astype(float)
y_all = df[target].values
X_all = df[domain].values

# Shuffle the row indices with a fixed seed so the split is reproducible,
# then use the first 80% for training and the remainder for testing.
idx_all = np.random.RandomState(1).permutation(len(y_all))
n_train = int(.8 * len(y_all))
idx_train = idx_all[:n_train]
idx_test = idx_all[n_train:]

# TRAIN AND TEST DATA
X_train = X_all[idx_train]
y_train = y_all[idx_train]
X_test = X_all[idx_test]
y_test = y_all[idx_test]
In [6]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model to compare the TPOT pipeline against. The seed is fixed so
# the baseline scores are the same on every run.
sklearn_model = RandomForestClassifier(random_state=1)
sklearn_model.fit(X_train, y_train)
Out[6]:
In [7]:
# Score the baseline model on the held-out test rows.
sklearn_predictions = sklearn_model.predict(X_test)
# print is called as a function with a single argument, which behaves the same
# on Python 2 and Python 3 (the print statement is a SyntaxError on Python 3).
print(classification_report(y_test, sklearn_predictions))
In [14]:
# Run the TPOT pipeline search with a small budget.
# NOTE(review): depending on the TPOT version, `max_time_mins` may override
# `generations` — confirm which limit actually applies.
# NOTE(review): a wall-clock time limit can still make runs differ even with a
# fixed seed — confirm the level of reproducibility needed.
tpot_model = TPOTClassifier(
    generations=3,
    population_size=10,
    verbosity=2,
    max_time_mins=10,
    random_state=1,  # fixed seed for the stochastic search
)
tpot_model.fit(X_train, y_train)
In [15]:
# Score the TPOT-selected pipeline on the same held-out test rows.
tpot_predictions = tpot_model.predict(X_test)
# print is called as a function with a single argument, which behaves the same
# on Python 2 and Python 3 (the print statement is a SyntaxError on Python 3).
print(classification_report(y_test, tpot_predictions))
In [17]:
# Write the pipeline chosen by TPOT out as a Python script.
export_path = 'optimal-saleability-model.py'
tpot_model.export(export_path)
In [18]:
# Show the contents of the exported pipeline script in the cell output.
# NOTE(review): `cat` is a shell command, so this cell depends on a Unix-like shell.
!cat optimal-saleability-model.py
In [ ]:
In [ ]: