In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import randint
import random
import itertools 
from operator import itemgetter

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from vf_portalytics.feature_subset import FeatureSubsetModel, FeatureSubsetTransform
from vf_portalytics.model import PredictionModel

In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    x, y = make_regression(
        n_samples=n_samples, 
        n_features=n_features,
        noise=0.5,
        n_informative=n_informative, 
        random_state=0
    )
    x = pd.DataFrame(x)
    
    x.columns = ['feature_' + str(i) for i in range(n_features)]
    x = x.assign(**kwargs)
    return x, pd.Series(y, name='target')


# Generate data for 4 different categories:
# a different number of samples per category, but the same number of features, since they all belong to the same dataset
n_features = 20
x1, y1 = make_dataset(n_samples=100, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=80, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=120, n_features=n_features, n_informative=12, category='D')

# combine into one dataset
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)

# make the first two features categorical
labels = ['g1', 'g2', 'g3']
bins = [[],[]]
for i in range(2):
    bins[i] = [-np.inf, 
               total_x['feature_' + str(i)].mean() - total_x['feature_' + str(i)].std(), 
               total_x['feature_' + str(i)].mean() + total_x['feature_' + str(i)].std(), 
               total_x['feature_' + str(i)].max()]
total_x['feature_0'] = pd.cut(total_x['feature_0'], bins=bins[0], labels=labels).astype('object')
total_x['feature_1'] = pd.cut(total_x['feature_1'], bins=bins[1], labels=labels).astype('object')
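
# optional sanity check: after binning, each of the two features should
# contain only the three labels g1, g2 and g3
print(total_x['feature_0'].value_counts())
print(total_x['feature_1'].value_counts())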

Generate data and lookup dictionary


In [3]:
column_names = ['promoted_price', 'consumer_length',
                'yearweek', 'original_product_dimension_44', 'product_volume_per_sku']

x1, y1 = make_dataset(n_samples=1000, n_features=5, n_informative=5, account_banner='A', product_desc='X')
x2, y2 = make_dataset(n_samples=1000, n_features=5, n_informative=3, account_banner='B', product_desc='Y')
# create one more group that has no sub_model and will therefore predict 0
x3, y3 = make_dataset(n_samples=1000, n_features=5, n_informative=1, account_banner='C', product_desc='Z')

# rename the generated features and discretize the lookup-key columns
for x in (x1, x2, x3):
    x.columns = column_names + ['account_banner', 'product_desc']
    x['yearweek'] = randint(1, 54, len(x))
    x['original_product_dimension_44'] = randint(0, 2, len(x))

# combine into one dataset
total_x = pd.concat([x1, x2, x3], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3], axis=0, ignore_index=True)
# Split into train and test
train_index, test_index = train_test_split(total_x.index, random_state=5)
train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]
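
# make_dict is assumed here; a minimal sketch of a helper that builds the
# (yearweek, original_product_dimension_44) -> predicted market volume
# lookup shown in Out[5] (an illustrative assumption, not the original code)
def make_dict():
    volumes = np.random.choice(np.linspace(-2.5, 2.5, 500), size=53)
    return {(week, 0): volume for week, volume in zip(range(1, 54), volumes)}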

# create the "predicted_market_volumes" lookup dictionary ("lookup_dict")
lookup_dict = make_dict()

In [4]:
train_x


Out[4]:
      promoted_price  consumer_length  yearweek  original_product_dimension_44  product_volume_per_sku account_banner product_desc
2528        1.240198        -0.588810        46                              0               -0.258654              C            Z
2828       -1.721130        -0.633242        18                              1               -0.251092              C            Z
2137       -2.306269         1.947770        33                              0                0.895523              C            Z
2637       -0.303963        -0.400043        50                              0               -0.559406              C            Z
135         1.529248         0.686483        26                              0               -0.081570              A            X
...              ...              ...       ...                            ...                     ...            ...          ...
2121       -0.504628         0.043220        43                              1               -1.403318              C            Z
1424       -0.722067         0.466792        25                              1               -3.326870              B            Y
1725       -0.389445        -0.658218        48                              1                1.398478              B            Y
2254        0.350929        -0.856347        53                              0               -0.523437              C            Z
2915       -1.847872         0.576796        25                              1               -2.980150              C            Z

2250 rows × 7 columns


In [5]:
lookup_dict


Out[5]:
{(1, 0): 0.6663326653306614,
 (2, 0): -2.0691382765531063,
 (3, 0): -0.9168336673346693,
 (4, 0): -1.8386773547094188,
 (5, 0): 0.4458917835671343,
 (6, 0): -0.9869739478957917,
 (7, 0): 2.409819639278557,
 (8, 0): 0.7965931863727453,
 (9, 0): 0.8967935871743484,
 (10, 0): -1.1472945891783568,
 (11, 0): -1.4579158316633267,
 (12, 0): -0.3356713426853708,
 (13, 0): 1.227454909819639,
 (14, 0): 2.4398797595190382,
 (15, 0): -0.015030060120240663,
 (16, 0): -2.009018036072144,
 (17, 0): -0.746492985971944,
 (18, 0): -2.0691382765531063,
 (19, 0): -1.3276553106212425,
 (20, 0): -1.3376753507014028,
 (21, 0): 1.1573146292585168,
 (22, 0): -1.3176352705410823,
 (23, 0): -1.1472945891783568,
 (24, 0): 1.0671342685370742,
 (25, 0): -2.3597194388777556,
 (26, 0): 1.8787575150300597,
 (27, 0): 0.39579158316633256,
 (28, 0): 1.3677354709418834,
 (29, 0): -1.6683366733466936,
 (30, 0): 0.7765531062124249,
 (31, 0): 0.6663326653306614,
 (32, 0): 0.405811623246493,
 (33, 0): 1.9789579158316633,
 (34, 0): 2.0490981963927855,
 (35, 0): -1.9188376753507015,
 (36, 0): -0.5360721442885772,
 (37, 0): 1.1573146292585168,
 (38, 0): -2.299599198396794,
 (39, 0): 1.4979959919839678,
 (40, 0): 0.06513026052104198,
 (41, 0): 1.0871743486973946,
 (42, 0): -1.1272545090180361,
 (43, 0): 2.0290581162324646,
 (44, 0): -1.3977955911823647,
 (45, 0): 1.1372745490981964,
 (46, 0): -1.9589178356713428,
 (47, 0): 0.39579158316633256,
 (48, 0): 1.0270541082164328,
 (49, 0): 1.8987975951903806,
 (50, 0): 1.447895791583166,
 (51, 0): 0.35571142284569124,
 (52, 0): -2.299599198396794,
 (53, 0): -0.5861723446893787}

Create pipeline


In [6]:
subset_cols = ('account_banner', 'product_desc')
sub_models = {
    ('A', 'X'): LinearRegression(),
    ('B', 'Y'): DecisionTreeRegressor(),
}


pipeline = Pipeline([
    ('transform', FeatureSubsetTransform(group_cols=subset_cols, transformer=PolynomialFeatures(2))),
    ('estimate', FeatureSubsetModel(lookup_dict=lookup_dict, group_cols=subset_cols, sub_models=sub_models))
])

Create VF Model Wrapper and Save pipeline


In [7]:
# Note: must use one_hot_encode=False to prevent one-hot encoding of categorical features in input data
model_wrapper = PredictionModel("my_test_model", path='/tmp', one_hot_encode=False)

model_wrapper.model = pipeline
# save feature names (not strictly required, since all the preprocessing is done in the pipeline)
model_wrapper.features = {
    # Grouping features
    'account_banner': [],
    'product_desc': [],
    # other features
    'promoted_price': [],
    'consumer_length': [],
    'yearweek': [],
    'original_product_dimension_44': [],
    'product_volume_per_sku': [],
}
model_wrapper.target = {'target': []}
model_wrapper.ordered_column_list = sorted(model_wrapper.features.keys())

model_wrapper.model.fit(train_x, train_y)

model_wrapper.save()


/home/ctselas/anaconda3/lib/python3.7/site-packages/sklearn/dummy.py:132: FutureWarning: The default value of strategy will change from stratified to prior in 0.24.
  "stratified to prior in 0.24.", FutureWarning)

Load Pre-Saved Model


In [8]:
# Don't specify one_hot_encode here because it will be looked up from the pickle file
saved_model = PredictionModel('my_test_model', path='/tmp')
saved_model.model


Out[8]:
Pipeline(memory=None,
         steps=[('transform',
                 FeatureSubsetTransform(group_cols=('account_banner',
                                                    'product_desc'),
                                        transformer=PolynomialFeatures(degree=2,
                                                                       include_bias=True,
                                                                       interaction_only=False,
                                                                       order='C'))),
                ('estimate',
                 FeatureSubsetModel(group_cols=('account_banner',
                                                'product_desc'),
                                    lookup_dict={(1, 0): 0.6663326653306614,
                                                 (2, 0): -2.0691382765531063,
                                                 (3, 0): -...
                                                ('B', 'Y'): DecisionTreeRegressor(ccp_alpha=0.0,
                                                                                  criterion='mse',
                                                                                  max_depth=None,
                                                                                  max_features=None,
                                                                                  max_leaf_nodes=None,
                                                                                  min_impurity_decrease=0.0,
                                                                                  min_impurity_split=None,
                                                                                  min_samples_leaf=1,
                                                                                  min_samples_split=2,
                                                                                  min_weight_fraction_leaf=0.0,
                                                                                  presort='deprecated',
                                                                                  random_state=None,
                                                                                  splitter='best'),
                                                ('C', 'Z'): DummyClassifier(constant=0,
                                                                            random_state=None,
                                                                            strategy='warn')}))],
         verbose=False)

Test the results


In [9]:
# check, using only the first group, that the pipeline behaves as expected
groups = train_x.groupby(by=list(subset_cols))
_, train_x = list(groups)[0]

groups = test_x.groupby(by=list(subset_cols))
_, test_x = list(groups)[0]

train_y = train_y.loc[train_x.index]
test_y = test_y.loc[test_x.index]

In [10]:
# predict with pipeline
pipeline_predicted = saved_model.model.predict(test_x)

In [11]:
# drop the columns that declare the group since we use only one group for the test
test_x.drop(list(subset_cols), axis=1, inplace=True)
train_x.drop(list(subset_cols), axis=1, inplace=True)

In [12]:
# transform the price column
transformer = PolynomialFeatures(2)
transformer.fit(train_x[['promoted_price']])

def transform_data(data):
    transformed_price = transformer.transform(data[['promoted_price']])
    transformed_price = pd.DataFrame(data=transformed_price, index=data.index,
                                     columns=transformer.get_feature_names(['promoted_price']))
    # drop the bias column ('1') and the degree-1 term, which duplicates the original price column
    transformed_price.drop(['1', 'promoted_price'], axis=1, inplace=True)
    transformed_x = pd.concat([data, transformed_price], axis=1)
    return transformed_x
train_transformed = transform_data(train_x)
test_transformed = transform_data(test_x)

# the price-related columns: ['promoted_price', 'promoted_price^2']
price_columns = [col for col in test_transformed if col.startswith('promoted_price')]

In [13]:
# predict market share using only the price-related features
model = LinearRegression().fit(train_transformed[price_columns], train_y)

predicted_market_share = model.predict(test_transformed[price_columns])
predicted_market_share = pd.Series(index=test_transformed.index, data=predicted_market_share)

In [14]:
# look up the predicted market volume for each (yearweek, original_product_dimension_44) pair,
# defaulting to 0 for pairs missing from the dictionary
test_x['predicted_market_volume'] = [lookup_dict.get((week, pr), 0)
                                     for week, pr in zip(test_x['yearweek'], test_x['original_product_dimension_44'])]

# predicted volume = market_share * market_volume * consumer_length / product_volume_per_sku, floored at 0
directly_predicted = predicted_market_share.mul(
        test_x['predicted_market_volume']).mul(
        test_x['consumer_length']).div(
        test_x['product_volume_per_sku']).clip(lower=0)
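
# sanity check (sketch): the manual computation should match the pipeline output
comparison = pd.DataFrame({'direct': directly_predicted, 'pipeline': pipeline_predicted})
assert np.allclose(comparison['direct'], comparison['pipeline'])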

In [15]:
pd.DataFrame({'directly_predicted': directly_predicted, 'pipeline_predicted': pipeline_predicted})


Out[15]:
     directly_predicted  pipeline_predicted
602            0.000000            0.000000
347            0.000000            0.000000
194            0.008533            0.008533
791           -0.000000           -0.000000
795           -0.000000           -0.000000
..                  ...                 ...
444            0.000000            0.000000
280           22.812515           22.812515
342            0.000000            0.000000
601           -0.000000           -0.000000
680            0.000000            0.000000

251 rows × 2 columns