In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import randint
import random
import itertools
from operator import itemgetter
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from vf_portalytics.feature_subset import FeatureSubsetModel, FeatureSubsetTransform
from vf_portalytics.model import PredictionModel
In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    x, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=0.5,
        n_informative=n_informative,
        random_state=0
    )
    x = pd.DataFrame(x)
    x.columns = ['feature_' + str(i) for i in range(n_features)]
    x = x.assign(**kwargs)
    return x, pd.Series(y, name='target')
# Generate data for 4 different categories:
# a different number of samples per category, but the same features, since they belong to the same dataset
n_features = 20
x1, y1 = make_dataset(n_samples=100, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=80, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=120, n_features=n_features, n_informative=12, category='D')
# combine into one dataset
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)
# make two random features categorical
labels = ['g1', 'g2', 'g3']
bins = [[],[]]
for i in range(2):
    bins[i] = [-np.inf,
               total_x['feature_' + str(i)].mean() - total_x['feature_' + str(i)].std(),
               total_x['feature_' + str(i)].mean() + total_x['feature_' + str(i)].std(),
               total_x['feature_' + str(i)].max()]
total_x['feature_0'] = pd.cut(total_x['feature_0'], bins=bins[0], labels=labels).astype('object')
total_x['feature_1'] = pd.cut(total_x['feature_1'], bins=bins[1], labels=labels).astype('object')
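The cut points above sit one standard deviation on either side of the mean, so for a roughly normal feature most rows land in the middle bucket. A quick way to inspect the split (column names as above):

print(total_x['feature_0'].value_counts(normalize=True))  # expect roughly 16% / 68% / 16% for g1 / g2 / g3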
In [3]:
column_names = ['promoted_price', 'consumer_length',
                'yearweek', 'original_product_dimension_44', 'product_volume_per_sku']
# reuse make_dataset per group, then rename the generic features to the domain columns
x1, y1 = make_dataset(n_samples=100, n_features=5, n_informative=5, account_banner='A', product_desc='X')
x2, y2 = make_dataset(n_samples=150, n_features=5, n_informative=3, account_banner='B', product_desc='Y')
# create one more group that has no sub_model and will therefore predict 0
x3, y3 = make_dataset(n_samples=80, n_features=5, n_informative=1, account_banner='C', product_desc='Z')
for df in (x1, x2, x3):
    df.columns = column_names + ['account_banner', 'product_desc']
# combine into one dataset
total_x = pd.concat([x1, x2, x3], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3], axis=0, ignore_index=True)
# Split into train and test
train_index, test_index = train_test_split(total_x.index, random_state=5)
train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]
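# `make_dict` is not defined in this excerpt; the following is a minimal stand-in,
# assumed to match how lookup_dict is used below: keys are
# (yearweek, original_product_dimension_44) pairs, values are market volumes
def make_dict():
    rng = np.random.RandomState(0)
    pairs = zip(total_x['yearweek'], total_x['original_product_dimension_44'])
    return {pair: rng.uniform(50, 100) for pair in pairs}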
# create the dictionary of predicted market volumes ("lookup_dict")
lookup_dict = make_dict()
In [4]:
train_x
Out[4]:
In [5]:
lookup_dict
Out[5]:
In [6]:
subset_cols = ('account_banner', 'product_desc')
sub_models = {
    ('A', 'X'): LinearRegression(),
    ('B', 'Y'): DecisionTreeRegressor(),
}
pipeline = Pipeline([
    ('transform', FeatureSubsetTransform(group_cols=subset_cols, transformer=PolynomialFeatures(2))),
    ('estimate', FeatureSubsetModel(lookup_dict=lookup_dict, group_cols=subset_cols, sub_models=sub_models))
])
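FeatureSubsetModel fits one sub-model per (account_banner, product_desc) group and predicts 0 for groups without one, such as ('C', 'Z') above. As a rough illustration of that per-group dispatch (a simplified sketch, not vf_portalytics' actual implementation):

# simplified sketch of per-group dispatch; not the library's actual code
def predict_by_group(df, sub_models, group_cols):
    parts = []
    for key, group in df.groupby(list(group_cols)):
        features = group.drop(list(group_cols), axis=1)
        model = sub_models.get(key)
        if model is None:
            # no sub-model registered for this group: predict 0
            parts.append(pd.Series(0.0, index=group.index))
        else:
            parts.append(pd.Series(model.predict(features), index=group.index))
    return pd.concat(parts).sort_index()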
In [7]:
# Note: must use one_hot_encode=False to prevent one-hot encoding of categorical features in input data
model_wrapper = PredictionModel("my_test_model", path='/tmp', one_hot_encode=False)
model_wrapper.model = pipeline
# save the feature names (not strictly necessary, since all the preprocessing is done inside the pipeline)
model_wrapper.features = {
    # grouping features
    'account_banner': [],
    'product_desc': [],
    # other features
    'promoted_price': [],
    'consumer_length': [],
    'yearweek': [],
    'original_product_dimension_44': [],
    'product_volume_per_sku': [],
}
model_wrapper.target = {'target': []}
model_wrapper.ordered_column_list = sorted(model_wrapper.features.keys())
model_wrapper.model.fit(train_x, train_y)
model_wrapper.save()
In [8]:
# Don't specify one_hot_encode here because it will be looked up from the pickle file
saved_model = PredictionModel('my_test_model', path='/tmp')
saved_model.model
Out[8]:
In [9]:
# check, for the first group, that the pipeline does what we expect
groups = train_x.groupby(by=list(subset_cols))
_, train_x = list(groups)[0]
groups = test_x.groupby(by=list(subset_cols))
_, test_x = list(groups)[0]
train_y = train_y.loc[train_x.index]
test_y = test_y.loc[test_x.index]
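Confirm which group was selected; pandas sorts group keys, so the first group should be ('A', 'X'), the pair handled by the LinearRegression sub-model:

# the group columns are still present at this point; expect [['A' 'X']]
print(test_x[['account_banner', 'product_desc']].drop_duplicates().values)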
In [10]:
# predict with pipeline
pipeline_predicted = saved_model.model.predict(test_x)
In [11]:
# drop the columns that identify the group, since we test only a single group
test_x = test_x.drop(list(subset_cols), axis=1)
train_x = train_x.drop(list(subset_cols), axis=1)
In [12]:
# transform the price column
transformer = PolynomialFeatures(2)
transformer.fit(train_x[['promoted_price']])
def transform_data(data):
    transformed_price = transformer.transform(data[['promoted_price']])
    # name the output columns after the fitted feature
    # (in sklearn >= 1.0 use get_feature_names_out instead)
    transformed_price = pd.DataFrame(data=transformed_price, index=data.index,
                                     columns=transformer.get_feature_names(['promoted_price']))
    # drop the bias term and the degree-1 column, keeping only 'promoted_price^2'
    transformed_price.drop(['1', 'promoted_price'], axis=1, inplace=True)
    transformed_x = pd.concat([data, transformed_price], axis=1)
    return transformed_x
train_transformed = transform_data(train_x)
test_transformed = transform_data(test_x)
price_columns = [col for col in test_transformed if col.startswith('promoted_price')]
In [13]:
# predict the market share using only the price-related columns
model = LinearRegression().fit(train_transformed[price_columns], train_y)
predicted_market_share = model.predict(test_transformed[price_columns])
predicted_market_share = pd.Series(index=test_transformed.index, data=predicted_market_share)
In [14]:
# predict the output directly:
# market_share * predicted_market_volume * consumer_length / product_volume_per_sku
test_x['predicted_market_volume'] = [lookup_dict.get((week, pr), 0)
                                     for week, pr in zip(test_x['yearweek'],
                                                         test_x['original_product_dimension_44'])]
directly_predicted = predicted_market_share.mul(
    test_x['predicted_market_volume']).mul(
    test_x['consumer_length']).div(
    test_x['product_volume_per_sku']).clip(lower=0)
In [15]:
pd.DataFrame({'directly_predicted': directly_predicted, 'pipeline_predicted': pipeline_predicted})
Out[15]:
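If the manual steps above replicate what the pipeline does internally, the two columns should agree; a quick check (assuming both implement the same formula):

print(np.allclose(directly_predicted, pipeline_predicted))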