Logistic regression where both the independent and the dependent variables are categorical
In [331]:
# imports etc
import pandas as pd
%matplotlib inline
In [126]:
# !!! Relevant reading
# http://blog.yhat.com/posts/logistic-regression-and-python.html
# http://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn
# http://blog.yhat.com/posts/logistic-regression-python-rodeo.html
In [392]:
#os = Linux, Mac, Windows
#browser = Firefox, Ie, Safari
#category = Electronics, Toys, Medicines
df_data = pd.DataFrame(
data=[
['L', 'F', 'E'],
['L', 'I', 'E'],
['L', 'S', 'E'],
['L', 'S', 'T'],
['M', 'F', 'E'],
['M', 'I', 'E'],
['M', 'S', 'E'],
['M', 'S', 'T'],
['W', 'F', 'M'],
['W', 'I', 'M'],
['W', 'S', 'M'],
['W', 'S', 'T'],
],
columns=['os', 'browser', 'category'])
# os and browser are the independent variables (predictors),
# category is the dependent variable (target)
X_COLS = ['os', 'browser']
Y_COL = 'category'
In [393]:
df_data.head(2)
Out[393]:
In [394]:
df = df_data.copy()
In [395]:
def get_vectorized_df(df_input, columns):
    """ One-hot encode the given categorical columns with DictVectorizer.
    Returns the fitted vectorizer and the encoded DataFrame.
    """
    from sklearn.feature_extraction import DictVectorizer
    import pandas as pd
    df = df_input[columns].copy()
    vectorizer = DictVectorizer(sparse=False)
    df_vec = vectorizer.fit_transform(df.to_dict(orient='records'))
    # vectorizer.feature_names_ holds the correspondence to the columns:
    return (vectorizer, pd.DataFrame(data=df_vec,
                                     columns=vectorizer.feature_names_))
def get_label_encoded_column_df(df_input, column):
    """ Label-encode the dependent (target) column for use as the
    regression target. Returns the fitted encoder and the encoded column.
    """
    from sklearn import preprocessing
    import pandas as pd
    df = df_input.copy()
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[column].values)
    encoded_labels = encoder.transform(df[column].values)
    return (encoder, pd.DataFrame(data=encoded_labels,
                                  columns=[column]))
In [396]:
# We label-encode the dependent variable and vectorize the independent variables
encoder, df_Y = get_label_encoded_column_df(df, Y_COL)
vectorizer, df_X = get_vectorized_df(df, X_COLS)
In [397]:
# We exclude one baseline column per categorical variable.
# This prevents multicollinearity (the "dummy variable trap") that would arise
# from keeping a dummy column for every single category.
# Here we do it 'manually', dropping the last column of each group;
# a sketch of deriving these columns programmatically follows below.
df_X = df_X.drop(['browser=S', 'os=W'], axis=1)
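The baseline columns could also be derived from the fitted vectorizer instead of being hard-coded; a minimal sketch follows (the names baseline_cols and group are just illustrative).
In [ ]:
# Sketch: pick the last dummy column of each categorical group as the baseline
baseline_cols = []
for col in X_COLS:
    group = [f for f in vectorizer.feature_names_ if f.startswith(col + '=')]
    baseline_cols.append(group[-1])
print baseline_cols  # ['os=W', 'browser=S'] for this data
# dropping these from the full dummy frame gives the same df_X as above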
In [398]:
df_X.head(2)
Out[398]:
In [399]:
df_Y.head(2)
Out[399]:
In [400]:
# Map the encoded values back to the original categories.
# The vectorizer encodes the independent variables; the LabelEncoder encodes
# the dependent variable. Since the baseline columns were dropped from df_X,
# inverse_transform only recovers the retained dummies.
independent_var_map = vectorizer.inverse_transform(df_X.values)
dependent_var_map = dict(zip(df_Y[Y_COL].values, encoder.inverse_transform(df_Y[Y_COL].values)))
print independent_var_map
print dependent_var_map
In [401]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
# we create an instance of LogisticRegression and fit the data
logreg.fit(df_X.values, df_Y.values.ravel())
Out[401]:
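A quick sanity check is to line the learned coefficients up with the dummy column names, one coefficient row per target class; a minimal inspection sketch:
In [ ]:
# One row of coefficients per target class, columns in the order of df_X
print df_X.columns.tolist()
print logreg.classes_   # encoded class labels from the LabelEncoder
print logreg.coef_      # shape (n_classes, n_features)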
In [402]:
df_test = pd.DataFrame(
data=[
['L', 'F'],
['M', 'S'],
['W', 'F']
],
columns=['os', 'browser'])
df_test
Out[402]:
In [391]:
# Side experiment: how an already-fitted DictVectorizer handles new data
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = pd.DataFrame(data=[['A', 'Y'],
                       ['A', 'Y'],
                       ['B', 'Z']], columns=['vara', 'varb'])
print D
X = v.fit_transform(D.to_dict(orient='records'))
print v.feature_names_
print X
D2 = pd.DataFrame(data=[['A', 'Y'],
                        ['A', 'Z'],
                        ['B', 'X']], columns=['vara', 'varb'])
# transform() reuses the feature mapping learned from D, so the unseen
# category 'X' is silently dropped ...
print v.transform(D2.to_dict(orient='records'))
# ... while a freshly fitted vectorizer learns its columns from D2 itself
v2 = DictVectorizer(sparse=False)
print v2.fit_transform(D2.to_dict(orient='records'))
In [378]:
print df_test.to_dict(orient='records')
# Vectorize the test set with the already-fitted vectorizer (transform, not
# fit_transform, so the learned feature mapping is reused), then drop the
# same baseline columns that were dropped from df_X.
df_test_vec = pd.DataFrame(data=vectorizer.transform(df_test.to_dict(orient='records')),
                           columns=vectorizer.feature_names_)
df_test_vec = df_test_vec.drop(['browser=S', 'os=W'], axis=1)
print df_test_vec
In [370]:
# Predict class labels for samples in X.
logreg.predict(df_test_vec)
In [351]:
# Probability estimates; the estimates for all classes are ordered by class label.
# Given the column order of df_X (['browser=F', 'browser=I', 'os=L', 'os=M']),
# [1, 0, 1, 0] corresponds to Firefox on Linux.
logreg.predict_proba([[1.0, 0.0, 1.0, 0.0]])
Out[351]:
In [ ]:
# from predict_proba we see that class 0 is by far the most likely,
# which matches what the .predict method returns; class 0 decodes to 'E'
# (Electronics), consistent with the training rows for Firefox on Linux
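As a final illustration, the encoded predictions can be decoded back to the original category names with the fitted LabelEncoder; a minimal sketch:
In [ ]:
# Decode the predicted labels back to category names
predictions = logreg.predict(df_test_vec)
print zip(df_test.to_dict(orient='records'),
          encoder.inverse_transform(predictions))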