Problem Description
We assume we are dealing with an e-commerce platform. Users arrive with different hardware/software profiles (Linux, Mac, Firefox, I.E., etc.) and navigate to different product categories.
We want to predict the preferred categories based on the user's hardware profile.
Features dealing with the hardware profile (The independent variables):
e.g.
platform: mobile, tablet, desktop, unknown ...
os: Windows, Windows 98, Bada, Solaris, Firefox OS, Ubuntu, OpenBSD ...
browser: Safari, BlackBerry WebKit, Pinterest, NetFront, PhantomJS, Chrome Mobile iOS, Chromium, Opera ...
device : Samsung SM-A300FU, HUAWEI LYO-L21, YD201, HTC One M9_Prime Camera Edit, Samsung SM-N910H
And the dependent variable: e.g. category: electronics, toys, pills
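For illustration, a single observation could look like the following sketch (the values are made up for this example):
In [ ]:
# One illustrative (made-up) observation: hardware profile -> preferred category
sample_observation = {'platform': 'mobile', 'os': 'Windows', 'browser': 'Chrome Mobile iOS',
                      'device': 'Samsung SM-A300FU', 'category': 'electronics'}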
In [1]:
# !!! Relevant reading
# http://blog.yhat.com/posts/logistic-regression-and-python.html
# http://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn
# http://blog.yhat.com/posts/logistic-regression-python-rodeo.html
In [2]:
import pandas as pd
import numpy as np
In [62]:
# we keep only the initial letter of the strings for simplicity
# Independent variables
os_values = ['L', 'M', 'W']
browser_values = ['F', 'I', 'S']
# Dependent Variable
categories = ['E', 'T', 'P']
# Hand-crafted probabilities so that we can check whether the model recovers them
# The combinations not listed below simply get (roughly) equal weights
features_categories_probabilities = {
    ('L', 'F'): [0.7, 0.2, 0.1],
    ('M', 'S'): [0.8, 0.1, 0.1],
    ('W', 'I'): [0.1, 0.1, 0.8]
}
def generate_dataset_with_probabilities(n_datapoints,
                                        input_values,
                                        input_probabilities=None):
    """ Return a dataset drawn from the given possible values with the given probabilities
    :n_datapoints: The number of datapoints we want to generate
    :input_values: 1-D array e.g. ['meat', 'fish', 'vegetables']
    :input_probabilities: 1-D array-like e.g. [0.5, 0.25, 0.25]
        The probabilities associated with each entry in input_values.
        If not given, the sampling assumes a uniform distribution over all input_values.
    """
    import numpy as np
    datapoints = []
    for i in range(n_datapoints):
        datapoints.append(
            np.random.choice(input_values,
                             p=input_probabilities))
    return datapoints
def get_features_values_combinations(list_a, list_b):
    """ Return the list of combinations of the values of the two lists
    e.g. from the lists
        list_a = ['L', 'M', 'W']
        list_b = ['F', 'I', 'S']
    we get the combinations:
        [('L', 'F'), ('L', 'I'), ('L', 'S'), ('M', 'F'), ('M', 'I') ...
    """
    import itertools
    return list(itertools.product(list_a, list_b))
n_datapoints = 100
datapoints = []
features_combinations = get_features_values_combinations(os_values, browser_values)
n_datapoints_per_combination = int(1. * n_datapoints / len(features_combinations))
for feature_comb in features_combinations:
    features_cat_prob = features_categories_probabilities.get(feature_comb, [0.33, 0.33, 1 - 0.33 - 0.33])
    generated_categories = generate_dataset_with_probabilities(
        n_datapoints_per_combination,
        categories,
        input_probabilities=features_cat_prob)
    datapoints.extend([[feature_comb[0], feature_comb[1], c] for c in generated_categories])
# Construct the final dataframe
x_vars = ['os', 'browser']
y_var = 'category'
df_data = pd.DataFrame(data=datapoints, columns=x_vars + [y_var])
df_data.head(2)
Out[62]:
In [65]:
from sklearn.model_selection import train_test_split
X = df_data[x_vars].values
y = df_data[y_var].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# And convert to DataFrames
df_train_X = pd.DataFrame(data=X_train, columns=x_vars)
df_train_y = pd.DataFrame(data=y_train, columns=[y_var])
df_test_X = pd.DataFrame(data=X_test, columns=x_vars)
df_test_y = pd.DataFrame(data=y_test, columns=[y_var])
df_train = pd.concat([df_train_X, df_train_y], axis=1)
df_test = pd.concat([df_test_X, df_test_y], axis=1)
In [68]:
from sklearn import linear_model
logistic = linear_model.LogisticRegression()
# NOTE: as-is this raises a ValueError, because X_train still contains the raw
# string categories ('L', 'F', ...); the features have to be encoded to numbers
# first, which is what the transformation chain below takes care of.
print('LogisticRegression score: %f'
      % logistic.fit(X_train, y_train).score(X_test, y_test))
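For reference, a minimal alternative sketch of the same idea with a scikit-learn Pipeline; this assumes scikit-learn >= 0.20, where OneHotEncoder accepts string categories directly, and is not the approach followed in the rest of the notebook:
In [ ]:
# Sketch (assumption: scikit-learn >= 0.20): one-hot encode the string features
# and fit the classifier in a single Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),  # 'os'/'browser' strings -> indicator columns
    LogisticRegression())
pipeline.fit(X_train, y_train)
print('Pipeline LogisticRegression score: %f' % pipeline.score(X_test, y_test))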
In [27]:
def feature_transformation_chain(df,
                                 y_column=None,
                                 vectorizer=None,
                                 encoder=None):
    """ Transform a dataframe into a form accepted by Logistic Regression
    :y_column: The column of the dependent variable
    TODO the encoder does not seem to be needed as an argument, cross-check
    """
    from sklearn.feature_extraction import DictVectorizer
    from sklearn import preprocessing
    import pandas as pd
    df_transformed = df.copy()
    x_columns = df_transformed.columns.values.tolist()
    x_columns = [x for x in x_columns if x != y_column]
    df_transformed_X = df_transformed[x_columns]
    # Vectorize (one-hot encode) the categorical independent variables
    if not vectorizer:
        vectorizer = DictVectorizer(sparse=False)
        df_transformed_X = vectorizer.fit_transform(df_transformed_X.to_dict(orient='records'))
    else:
        df_transformed_X = vectorizer.transform(df_transformed_X.to_dict(orient='records'))
    transformed_x_columns = vectorizer.feature_names_
    y_col_idx_map = None
    if y_column:
        df_transformed_Y = df_transformed[[y_column]].copy()
        # Encode the categorical dependent variable as integer labels
        encoder = preprocessing.LabelEncoder()
        encoder.fit(df_transformed_Y[y_column].values)
        encoded_labels = encoder.transform(df_transformed_Y[y_column].values)
        df_transformed_Y[y_column] = encoded_labels
        y_col_idx_map = dict(zip(df_transformed_Y[y_column].values,
                                 encoder.inverse_transform(df_transformed_Y[y_column].values)))
    # TODO cross-check whether the independent variables still need an explicit
    # conversion to numeric (DictVectorizer already returns a numeric array)
    # TODO cross-check whether we should drop one baseline column per categorical
    # variable to avoid collinearity, e.g. with
    #   transformed columns ['browser=F', 'browser=I', 'browser=S', 'os=L', 'os=M', 'os=W']
    # the baseline columns to drop would be ['browser=S', 'os=W']
    # Convert to DataFrames (and concat with the dependent component if it exists)
    df_transformed_X = pd.DataFrame(data=df_transformed_X, columns=transformed_x_columns)
    if y_column:
        df_transformed_Y = pd.DataFrame(data=df_transformed_Y.values, columns=[y_column])
        df_transformed = pd.concat([df_transformed_X, df_transformed_Y], axis=1)
    else:
        df_transformed = df_transformed_X
    return df_transformed, vectorizer, encoder, y_col_idx_map
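To make the effect of DictVectorizer more concrete, here is a small illustration (not from the original notebook) of how two records are turned into indicator columns:
In [ ]:
# Sketch: DictVectorizer maps each categorical value to its own 0/1 column
from sklearn.feature_extraction import DictVectorizer
demo_vectorizer = DictVectorizer(sparse=False)
demo_vectorizer.fit_transform([{'os': 'L', 'browser': 'F'},
                               {'os': 'M', 'browser': 'S'}])
# demo_vectorizer.feature_names_ is now ['browser=F', 'browser=S', 'os=L', 'os=M']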
In [33]:
# Transform the training set
df_train_transformed, vectorizer, encoder, y_col_idx_map = feature_transformation_chain(
    df_train, y_column=y_var)
In [31]:
df_train_transformed.head(2)
Out[31]:
In [32]:
def get_logistic_regression_model(df, y_column):
    """ Fit a Logistic Regression model on the already-transformed dataframe
    :y_column: The column of the (label-encoded) dependent variable
    """
    x_columns = df.columns.values.tolist()
    x_columns.remove(y_column)
    from sklearn import linear_model
    # C is the inverse of the regularization strength; a large C means
    # practically no regularization
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(df[x_columns].values, df[y_column].values)
    return logreg
In [34]:
logistic_regression_model = get_logistic_regression_model(df_train_transformed, 'category')
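As a quick sanity check (a sketch, not part of the original notebook), the fitted coefficient matrix has one row per category and one column per one-hot encoded feature:
In [ ]:
# 3 categories x 6 one-hot columns ('browser=F', ..., 'os=W') -> shape (3, 6)
logistic_regression_model.coef_.shape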
In [35]:
#TODO
In [36]:
# Reuse the vectorizer fitted on the training set so that the test set gets exactly the same one-hot columns
df_test_transformed, _, _, _ = feature_transformation_chain(df_test, y_column=y_var, vectorizer=vectorizer)
In [39]:
y_test = df_test_transformed[y_var]
In [49]:
y_predict = logistic_regression_model.predict(df_test_transformed.drop([y_var], axis=1).values)
In [51]:
df_predicted = pd.DataFrame(data=list(zip(y_test, y_predict)), columns=['real', 'prediction'])
In [57]:
df_predicted['cor'] = df_predicted.apply(lambda x: x['real']==x['prediction'], axis=1)
In [61]:
1.* df_predicted['cor'].sum() / len(df_predicted)
Out[61]:
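The same ratio can be computed with scikit-learn's accuracy_score; a sketch, equivalent to the manual computation above:
In [ ]:
# Fraction of test rows where the prediction matches the real category
from sklearn.metrics import accuracy_score
accuracy_score(df_predicted['real'], df_predicted['prediction'])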
In [40]:
# Predicted probabilities for a single test row (drop the dependent column and
# reshape to a single-sample 2-D array)
logistic_regression_model.predict_proba(
    df_test_transformed.drop([y_var], axis=1).values[0].reshape(1, -1)).tolist()[0]
In [122]:
df_test = pd.DataFrame(
    data=[
        ['L', 'F'],
        ['L', 'I'],
        ['W', 'S']
    ],
    columns=['os', 'browser'])
# 1. Transform the input data, reusing the vectorizer fitted on the training set
df_test_transformed, _, _, _ = feature_transformation_chain(df_test, vectorizer=vectorizer)
df_test_transformed
# 2. Predict the category probabilities for each row
probs_all = []
for i in range(len(df_test_transformed)):
    probs = logistic_regression_model.predict_proba(
        df_test_transformed.values[i].reshape(1, -1)).tolist()[0]
    probs_all.append(probs)
# TODO map each probability to its category name via y_col_idx_map and return a
# DataFrame with columns ['category', 'probability']
print(probs_all)
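A sketch of what the commented-out TODO above is aiming at (not part of the original notebook): label each probability column with its category name via the y_col_idx_map built from the training labels:
In [ ]:
# One column per category, one row per hand-crafted test profile
category_names = [y_col_idx_map[i] for i in range(len(y_col_idx_map))]
df_probs = pd.DataFrame(data=probs_all, columns=category_names)
df_probs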
In [117]:
#y_col_idx_map
In [118]:
# remove the columns generated by the transformation
# TODO the drop below is hard-coded; make it more robust
#df_test_transformed = df_test_transformed.drop('category', axis=1)
In [119]:
logistic_regression_model.predict_proba(df_test_transformed.values[0].reshape(1, -1))
Out[119]: