In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
from copy import deepcopy
from numpy.random import randint
import random
import itertools
from operator import itemgetter
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from vf_portalytics.feature_subset import FeatureSubsetModel, FeatureSubsetTransform
from vf_portalytics.model import PredictionModel
In [2]:
def make_dataset(n_samples, n_features, n_informative, **kwargs):
    x, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        noise=0.5,
        n_informative=n_informative,
        random_state=0
    )
    x = pd.DataFrame(x)
    x.columns = ['feature_' + str(i) for i in range(n_features)]
    x = x.assign(**kwargs)
    return x, pd.Series(y, name='target')
# Generate data for 4 different categories:
# a different number of samples per category, but the same features, since they belong to the same dataset
n_features = 20
x1, y1 = make_dataset(n_samples=100, n_features=n_features, n_informative=10, category='A')
x2, y2 = make_dataset(n_samples=150, n_features=n_features, n_informative=8, category='B')
x3, y3 = make_dataset(n_samples=80, n_features=n_features, n_informative=7, category='C')
x4, y4 = make_dataset(n_samples=120, n_features=n_features, n_informative=12, category='D')
# combine into one dataset
total_x = pd.concat([x1, x2, x3, x4], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3, y4], axis=0, ignore_index=True)
# make two random features categorical
labels = ['g1', 'g2', 'g3']
bins = [[],[]]
for i in range(2):
    bins[i] = [-np.inf,
               total_x['feature_' + str(i)].mean() - total_x['feature_' + str(i)].std(),
               total_x['feature_' + str(i)].mean() + total_x['feature_' + str(i)].std(),
               total_x['feature_' + str(i)].max()]
total_x['feature_0'] = pd.cut(total_x['feature_0'], bins=bins[0], labels=labels).astype('object')
total_x['feature_1'] = pd.cut(total_x['feature_1'], bins=bins[1], labels=labels).astype('object')
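The cut points above sit one standard deviation on either side of the mean, so for a roughly normal feature most rows land in the middle bucket. A quick way to inspect the split (column names as above):

print(total_x['feature_0'].value_counts(normalize=True))  # expect roughly 16% / 68% / 16% for g1 / g2 / g3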
In [3]:
column_names = ['promoted_price', 'consumer_length',
                'yearweek', 'original_product_dimension_44', 'product_volume_per_sku']
# reuse make_dataset per group, then rename the generic features to the domain columns
x1, y1 = make_dataset(n_samples=100, n_features=5, n_informative=5, account_banner='A', product_desc='X')
x2, y2 = make_dataset(n_samples=150, n_features=5, n_informative=3, account_banner='B', product_desc='Y')
# create one more group that has no sub_model and will therefore predict 0
x3, y3 = make_dataset(n_samples=80, n_features=5, n_informative=1, account_banner='C', product_desc='Z')
for df in (x1, x2, x3):
    df.columns = column_names + ['account_banner', 'product_desc']
# combine into one dataset
total_x = pd.concat([x1, x2, x3], axis=0, ignore_index=True)
total_y = pd.concat([y1, y2, y3], axis=0, ignore_index=True)
# Split into train and test
train_index, test_index = train_test_split(total_x.index, random_state=5)
train_x, train_y = total_x.loc[train_index, :], total_y.loc[train_index]
test_x, test_y = total_x.loc[test_index, :], total_y.loc[test_index]
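# `make_dict` is not defined in this excerpt; the following is a minimal stand-in,
# assumed to match how lookup_dict is used below: keys are
# (yearweek, original_product_dimension_44) pairs, values are market volumes
def make_dict():
    rng = np.random.RandomState(0)
    pairs = zip(total_x['yearweek'], total_x['original_product_dimension_44'])
    return {pair: rng.uniform(50, 100) for pair in pairs}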
# create the dictionary of predicted market volumes ("lookup_dict")
lookup_dict = make_dict()
In [4]:
train_x
Out[4]:
In [5]:
lookup_dict
Out[5]:
In [6]:
subset_cols = ('account_banner', 'product_desc')
sub_models = {
    ('A', 'X'): LinearRegression(),
    ('B', 'Y'): DecisionTreeRegressor(),
}
pipeline = Pipeline([
    ('transform', FeatureSubsetTransform(group_cols=subset_cols, transformer=PolynomialFeatures(2))),
    ('estimate', FeatureSubsetModel(lookup_dict=lookup_dict, group_cols=subset_cols, sub_models=sub_models))
])
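FeatureSubsetModel fits one sub-model per (account_banner, product_desc) group and predicts 0 for groups without one, such as ('C', 'Z') above. As a rough illustration of that per-group dispatch (a simplified sketch, not vf_portalytics' actual implementation):

# simplified sketch of per-group dispatch; not the library's actual code
def predict_by_group(df, sub_models, group_cols):
    parts = []
    for key, group in df.groupby(list(group_cols)):
        features = group.drop(list(group_cols), axis=1)
        model = sub_models.get(key)
        if model is None:
            # no sub-model registered for this group: predict 0
            parts.append(pd.Series(0.0, index=group.index))
        else:
            parts.append(pd.Series(model.predict(features), index=group.index))
    return pd.concat(parts).sort_index()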
In [7]:
# Note: must use one_hot_encode=False to prevent one-hot encoding of categorical features in input data
model_wrapper = PredictionModel("my_test_model", path='/tmp', one_hot_encode=False)
model_wrapper.model = pipeline
# save the feature names (not strictly necessary, since all the preprocessing is done inside the pipeline)
model_wrapper.features = {
    # grouping features
    'account_banner': [],
    'product_desc': [],
    # other features
    'promoted_price': [],
    'consumer_length': [],
    'yearweek': [],
    'original_product_dimension_44': [],
    'product_volume_per_sku': [],
}
model_wrapper.target = {'target': []}
model_wrapper.ordered_column_list = sorted(model_wrapper.features.keys())
model_wrapper.model.fit(train_x, train_y)
model_wrapper.save()
In [8]:
# Don't specify one_hot_encode here because it will be looked up from the pickle file
saved_model = PredictionModel('my_test_model', path='/tmp')
saved_model.model
Out[8]:
In [9]:
# check, for the first group, that the pipeline does what we expect
groups = train_x.groupby(by=list(subset_cols))
_, train_x = list(groups)[0]
groups = test_x.groupby(by=list(subset_cols))
_, test_x = list(groups)[0]
train_y = train_y.loc[train_x.index]
test_y = test_y.loc[test_x.index]
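Confirm which group was selected; pandas sorts group keys, so the first group should be ('A', 'X'), the pair handled by the LinearRegression sub-model:

# the group columns are still present at this point; expect [['A' 'X']]
print(test_x[['account_banner', 'product_desc']].drop_duplicates().values)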
In [10]:
# predict with pipeline
pipeline_predicted = saved_model.model.predict(test_x)
In [11]:
# drop the columns that identify the group, since we test only a single group
test_x = test_x.drop(list(subset_cols), axis=1)
train_x = train_x.drop(list(subset_cols), axis=1)
In [12]:
# transform the price column
transformer = PolynomialFeatures(2)
transformer.fit(train_x[['promoted_price']])
def transform_data(data):
    transformed_price = transformer.transform(data[['promoted_price']])
    # name the output columns after the fitted feature
    # (in sklearn >= 1.0 use get_feature_names_out instead)
    transformed_price = pd.DataFrame(data=transformed_price, index=data.index,
                                     columns=transformer.get_feature_names(['promoted_price']))
    # drop the bias term and the degree-1 column, keeping only 'promoted_price^2'
    transformed_price.drop(['1', 'promoted_price'], axis=1, inplace=True)
    transformed_x = pd.concat([data, transformed_price], axis=1)
    return transformed_x
train_transformed = transform_data(train_x)
test_transformed = transform_data(test_x)
price_columns = [col for col in test_transformed if col.startswith('promoted_price')]
In [13]:
# predict the market share using only the price-related columns
model = LinearRegression().fit(train_transformed[price_columns], train_y)
predicted_market_share = model.predict(test_transformed[price_columns])
predicted_market_share = pd.Series(index=test_transformed.index, data=predicted_market_share)
In [14]:
# predict the output directly:
# market_share * predicted_market_volume * consumer_length / product_volume_per_sku
test_x['predicted_market_volume'] = [lookup_dict.get((week, pr), 0)
                                     for week, pr in zip(test_x['yearweek'],
                                                         test_x['original_product_dimension_44'])]
directly_predicted = predicted_market_share.mul(
    test_x['predicted_market_volume']).mul(
    test_x['consumer_length']).div(
    test_x['product_volume_per_sku']).clip(lower=0)
In [15]:
pd.DataFrame({'directly_predicted': directly_predicted, 'pipeline_predicted': pipeline_predicted})
Out[15]:
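If the manual steps above replicate what the pipeline does internally, the two columns should agree; a quick check (assuming both implement the same formula):

print(np.allclose(directly_predicted, pipeline_predicted))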