Problem Description

We assume that we deal with a problem of e-commerce platform. People come from different hardware profiles (Linux, Mac, Firefox, I.E. etc) and navigate to different categories.

We want to predict the preference of categories based on the user hardware profile

Features dealing with the hardware profile (The independent variables): e.g. platform: mobile, tablet, desktop , unknown ... os: Windows, Windows 98, Bada, Solaris, Firefox OS, Ubuntu, OpenBSD ... browser : Safari, BlackBerry WebKit, Pinterest, NetFront, PhantomJS, Chrome Mobile iOS, Chromium, Opera ...
device : Samsung SM-A300FU, HUAWEI LYO-L21, YD201, HTC One M9_Prime Camera Edit, Samsung SM-N910H

And the dependent variable: e.g. category: electronics, toys, pills


In [1]:
# !!! Relevant reading
# http://blog.yhat.com/posts/logistic-regression-and-python.html
# http://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn
# http://blog.yhat.com/posts/logistic-regression-python-rodeo.html

In [2]:
import pandas as pd
import numpy as np

Generate Data


In [62]:
# we keep only the initial letter of the strings for simplicity

# Independent variables
os_values = ['L', 'M', 'W']
browser_values = ['F', 'I', 'S']
# Dependent Variable
categories = ['E', 'T', 'P']

# The input with hand so to check if our model works 
# The categories for the combinations not found below simply get equal weights
features_categories_probabilites = {
    ('L', 'F') : [0.7, 0.2, 0.1],
    ('M', 'S') : [0.8, 0.1, 0.1],
    ('W', 'I') : [0.1, 0.1, 0.8]
}

def generate_dataset_with_probabilities(n_datapoints,
                                     input_values,
                                     input_probabilites=None):
    """ Return a dataset of given possible values with given possible probabilities
    :n_datapoints: The number of datapoints we want to generate
    :input_values: 1-D array e.g. ['meat', 'fish', 'vegetables']
    :input_probabilites: 1-D array-like e.g. [0.5, 0.25, 0.25]
        The probabilities associated with each entry in entries_values.
        If not given the sample assumes a uniform distribution over all entries_values
    """
    import numpy as np
    datapoints = []
    for i in range(n_datapoints):
        datapoints.append(
            np.random.choice(input_values, 
                             p=input_probabilites))
    return datapoints


def get_features_values_combinations(list_a, list_b):
    """ Returns a list of combinations of the values of 
    e.g. from the lists 
    list_a = ['L', 'M', 'W']
    list_b = ['F', 'I', 'S']
    we get the combinations:
    [('L', 'F'), ('L', 'I'), ('L', 'S'), ('M', 'F'), ('M', 'I') ...
    """
    import itertools
    return list(itertools.product(list_a, list_b))


n_datapoints = 100
datapoints = []
features_combinations = get_features_values_combinations(os_values, browser_values)
n_datapoints_per_combination = int (1. * n_datapoints / len(features_combinations)) 
for feature_comb in features_combinations:
    features_cat_prob = features_categories_probabilites.get(feature_comb, [0.33, 0.33, 1-0.33-0.33])
    for ii in range(n_datapoints_per_combination):
        generated_categories = generate_dataset_with_probabilities(
            n_datapoints_per_combination,
            categories,
            input_probabilites=features_cat_prob)                
        datapoints.extend([[feature_comb[0], feature_comb[1], c] for c in generated_categories])
    

    
# Construct the final dataframe
x_vars = ['os', 'browser']
y_var = 'category'

columns.append(y_var)

df_data = pd.DataFrame(data=datapoints, columns=x_vars + [y_var])
df_data.head(2)


Out[62]:
os browser category
0 L F E
1 L F T

Train Test Split


In [65]:
from sklearn.model_selection import train_test_split
X = df_data[x_vars].values
y = df_data[y_var].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# And convert to DataFrames
df_train_X = pd.DataFrame(data=X_train, columns=x_vars)
df_train_y = pd.DataFrame(data=y_train, columns=[y_var])

df_test_X = pd.DataFrame(data=X_test, columns=x_vars)
df_test_y = pd.DataFrame(data=y_test, columns=[y_var])

df_train = pd.concat([df_train_X, df_train_y], axis=1)
df_test = pd.concat([df_test_X, df_test_y], axis=1)

In [68]:
from sklearn import  linear_model
logistic = linear_model.LogisticRegression()
print('LogisticRegression score: %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-68-941b3d6e0339> in <module>()
      2 logistic = linear_model.LogisticRegression()
      3 print('LogisticRegression score: %f'
----> 4       % logistic.fit(X_train, y_train).score(X_test, y_test))

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/sklearn/linear_model/logistic.pyc in fit(self, X, y, sample_weight)
   1171 
   1172         X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
-> 1173                          order="C")
   1174         check_classification_targets(y)
   1175         self.classes_ = np.unique(y)

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    519     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    520                     ensure_2d, allow_nd, ensure_min_samples,
--> 521                     ensure_min_features, warn_on_dtype, estimator)
    522     if multi_output:
    523         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: I

Feature Transformation


In [27]:
def feature_tranformation_chain(df, 
                           y_column=None,
                           vectorizer=None, 
                           encoder=None):
    """ Transforms a dataframe to a form accepted by Logistic Regression
    :y_column: The column of the dependent variable
    TODO encoder looks like is not needed as arg, xcheck
    """
    from sklearn.feature_extraction import DictVectorizer
    from sklearn import preprocessing
    import pandas as pd
    import numpy as np
    
    df_transformed = df.copy()
    x_columns = df_transformed.columns.values.tolist() 
    x_columns = [x for x in x_columns if x != y_column]

    df_transformed_X = df_transformed[x_columns]
    # Take vectorize the categorical independent variables
    if not vectorizer:
        vectorizer = DictVectorizer(sparse=False) 
        df_transformed_X = vectorizer.fit_transform(df_transformed_X.to_dict(orient='records'))
    else:
        df_transformed_X = vectorizer.transform(df_transformed_X.to_dict(orient='records'))
    original_columns = x_columns
    transformed_x_columns = vectorizer.feature_names_



    y_col_idx_map = None
    if y_column:
        df_transformed_Y = df_transformed[[y_column]]

        # Encode the categorical dependent variable
        encoder = preprocessing.LabelEncoder()
        encoder.fit(df_transformed_Y[y_column].values)
        encoded_labels = encoder.transform(df_transformed_Y[y_column].values)
        df_transformed_Y[y_column] = encoded_labels
        y_col_idx_map = dict(zip(df_transformed_Y[y_column].values, 
                                       encoder.inverse_transform(df_transformed_Y[y_column].values)))

    

    # TODO check that next step is not needed
    """
    # 2. Concert to numeric the independent variables
    #print df_transformed
    df_transformed_X = df_transformed_X.convert_objects(convert_numeric=True)
    #print df_transformed
    """


    
    
    """ TODO cross check that we do not need the part below
    # 4. Exclude the baseline columns for each categorical variable separately to avoid collinearity
    #e.g.
    #original_columns = ['os', 'browser']
    #transformed_columns = ['browser=F', 'browser=I', 'browser=S', 'category', 'os=L', 'os=M', 'os=W']
    #calculated baseline_columns = ['os=W', 'browser=S']
    #baseline_columns_idx = [6, 2]
    
        
    print transformed_columns

    baseline_columns = []    
    for original_column in original_columns:
        baseline_columns.append([c for c in transformed_columns if original_column + '=' in c][-1])
    baseline_columns_idxs = [transformed_columns.index(i) for i in baseline_columns]
    print baseline_columns_idxs
    df_transformed = np.delete(df_transformed, baseline_columns_idxs, 1) 
    transformed_columns = np.delete(transformed_columns, baseline_columns_idxs).tolist() 
    """

    
    # Concert to DataFrames (and concat with dependent component if exists)
    df_transformed_X = pd.DataFrame(data = df_transformed_X, columns=transformed_x_columns)
    if y_column:
        df_transformed_Y = pd.DataFrame(data = df_transformed_Y, columns=[y_column])
        df_transformed = pd.concat([df_transformed_X, df_transformed_Y], axis=1)
    else:
        df_transformed = pd.DataFrame(data = df_transformed_X, columns=transformed_x_columns)



    return df_transformed, vectorizer, encoder, y_col_idx_map

In [33]:
# Output
df_train_transformed, vectorizer, encoder , y_col_idx_map = feature_tranformation_chain(df_train, 
                                                                                  y_column=y_var)


/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:38: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [31]:
df_transformed.head(2)


Out[31]:
browser=F browser=I browser=S os=L os=M os=W category
0 0.0 1.0 0.0 0.0 1.0 0.0 1
1 0.0 0.0 1.0 0.0 1.0 0.0 0

Model


In [32]:
def get_logistic_regresion_model(df, y_column):
    """
    """
    x_columns = df.columns.values.tolist() 
    x_columns.remove(y_column)
    
    from sklearn import linear_model, datasets
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(df[x_columns].values, df[y_column].values)
    return logreg

In [34]:
logistic_regresion_model = get_logistic_regresion_model(df_train_transformed, 'category')

Tune Model


In [35]:
#TODO

Predict


In [36]:
df_test_transformed, vectorizer, encoder , _ = feature_tranformation_chain(df_test, y_column=y_var)


/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:38: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [39]:
y_test = df_test_transformed[y_var]

In [49]:
y_predict = logistic_regresion_model.predict(df_test_transformed.drop([y_var],axis=1).values)

In [51]:
df_predicted = pd.DataFrame(data=zip(y_test, y_predict), columns=['real', 'prediction'])

In [57]:
df_predicted['cor'] = df_predicted.apply(lambda x: x['real']==x['prediction'], axis=1)

In [61]:
1.* df_predicted['cor'].sum() / len(df_predicted)


Out[61]:
0.42777777777777776

In [40]:
logistic_regresion_model.predict_proba(df_test_transformed.values[i]).tolist()[0]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-40-240368289bd0> in <module>()
----> 1 logistic_regresion_model.predict_proba(df_test_transformed.values[i]).tolist()[0]

NameError: name 'i' is not defined

In [122]:
df_test = pd.DataFrame(
    data=[
   ['L', 'F'],
   ['L', 'I'],
   ['W', 'S']
   ], 
columns=['os', 'browser'])

# 1. Transform input data
df_test_transformed, vectorizer, encoder, _ = feature_tranformation_chain(df_test,  vectorizer = vectorizer)
df_test_transformed

probs_all = []
for i in range(len(df_test_transformed)):
    probs = logistic_regresion_model.predict_proba(df_test_transformed.values[i]).tolist()[0]
    probs_all.append(probs)
#data = []
#for i in range(9):
#    data.append([y_col_idx_map[i], probs[i]])   
#print "~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~"
#    return pd.DataFrame(data=data, 
#                        columns=['category'+df_suffix, 'probability'+df_suffix])

print probs_all


[[0.448848567463995, 0.25489236990701236, 0.29625906262899265], [0.2813156019776315, 0.4494225116631866, 0.2692618863591819], [0.35131137131936746, 0.37016849583776545, 0.27852013284286703]]

In [117]:
#y_col_idx_map

In [118]:
# remove colums generated by the transformation
# TODO next drop with hand make it more robust
#df_test_transformed = df_test_transformed.drop('category', axis=1)

In [119]:
logistic_regresion_model.predict_proba(df_test_transformed.values[0])


Out[119]:
array([[ 0.44884857,  0.25489237,  0.29625906]])

In [ ]:
logistic_regresion_model.predict_proba(df_test_transformed.values[0])