Logistic regression where both the independent and the dependent variables are categorical


In [331]:
# imports etc
import pandas as pd
%matplotlib inline

In [126]:
# !!! Relevant reading
# http://blog.yhat.com/posts/logistic-regression-and-python.html
# http://stats.stackexchange.com/questions/224051/one-hot-vs-dummy-encoding-in-scikit-learn
# http://blog.yhat.com/posts/logistic-regression-python-rodeo.html

Input data


In [392]:
#os = Linux, Mac, Windows
#browser = Firefox, Ie, Safari
#category = Electronics, Toys, Medicines

df_data = pd.DataFrame(
    data=[
   ['L', 'F', 'E'],
   ['L', 'I', 'E'],
   ['L', 'S', 'E'],
   ['L', 'S', 'T'],
   ['M', 'F', 'E'],
   ['M', 'I', 'E'],
   ['M', 'S', 'E'],
   ['M', 'S', 'T'],
   ['W', 'F', 'M'],
   ['W', 'I', 'M'],
   ['W', 'S', 'M'],
   ['W', 'S', 'T'],
   ], 
columns=['os', 'browser', 'category'])


# Independent variables (features) and dependent variable (target);
# both sides are categorical.
X_COLS = ['os', 'browser']   # independent, one-hot encoded below
Y_COL = 'category'           # dependent, label-encoded below

In [393]:
df_data.head(2)


Out[393]:
os browser category
0 L F E
1 L I E

Feature Transformation


In [394]:
df = df_data.copy()

In [395]:
def get_vectorized_df(df_input, columns):
    """One-hot encode the given categorical columns with a DictVectorizer.

    Returns the fitted vectorizer and a DataFrame with one 'column=value'
    feature per category level.
    """
    from sklearn.feature_extraction import DictVectorizer
    import pandas as pd
    df = df_input[columns].copy()
    vectorizer = DictVectorizer(sparse=False) 
    df_vec = vectorizer.fit_transform(df.to_dict(orient='records'))
    # dv.feature_names_ holds correspondence to the columns:
    return (vectorizer, pd.DataFrame(data=df_vec, 
                        columns=vectorizer.feature_names_))
    
def get_label_encoded_column_df(df_input, column):
    """Label-encode a single categorical column.

    Used here for the dependent variable (the regression target).
    """
    from sklearn import preprocessing
    df = df_input.copy()
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[column].values)
    encoded_labels = encoder.transform(df[column].values)
    return (encoder, pd.DataFrame(data=encoded_labels, 
                        columns=[column]))

In [396]:
# We label-encode the dependent variable and vectorize the independent variables
encoder, df_Y = get_label_encoded_column_df(df, Y_COL)
vectorizer, df_X = get_vectorized_df(df, X_COLS)



In [397]:
# We exclude one baseline column per categorical variable.
# This prevents the dummy variable trap (perfect multicollinearity) that
# arises when a dummy column is kept for every single category level.

# Here we do it 'manually', dropping the last column of each group;
# an automatic alternative is sketched below.
df_X = df_X.drop(['browser=S', 'os=W'], axis=1)
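
An equivalent, more automatic route (a sketch, not used further in this notebook) is pandas' get_dummies with drop_first=True, which one-hot encodes and drops one baseline level per column for us; note that it drops the first level of each column, whereas above we dropped the last.


In [ ]:
# Hedged alternative sketch (requires pandas >= 0.18 for drop_first):
# one-hot encode the independent variables and drop one baseline level
# per column automatically.
pd.get_dummies(df[X_COLS], drop_first=True).head(2)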

In [398]:
df_X.head(2)


Out[398]:
browser=F browser=I os=L os=M
0 1.0 0.0 1.0 0.0
1 0.0 1.0 1.0 0.0

In [399]:
df_Y.head(2)


Out[399]:
category
0 0
1 0

In [400]:
# The vectorizer describes the independent variables, the encoder the dependent one
independent_var_map = vectorizer.inverse_transform(df_X.values)
dependent_var_map = dict(zip(df_Y[Y_COL].values, encoder.inverse_transform(df_Y[Y_COL].values)))
print independent_var_map
print dependent_var_map


[{'browser=F': 1.0, 'browser=S': 1.0}, {'browser=S': 1.0, 'browser=I': 1.0}, {'browser=S': 1.0}, {'browser=S': 1.0}, {'browser=F': 1.0, 'os=L': 1.0}, {'os=L': 1.0, 'browser=I': 1.0}, {'os=L': 1.0}, {'os=L': 1.0}, {'browser=F': 1.0}, {'browser=I': 1.0}, {}, {}]
{0: 'E', 1: 'M', 2: 'T'}
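
Note that inverse_transform is applied here to a matrix from which the baseline columns were already dropped, so the vectorizer matches the remaining four columns against the first four of its original six feature names; that is why a row such as [1, 0, 1, 0] (browser=F, os=L) is reported as {'browser=F': 1.0, 'browser=S': 1.0}. A safer way to inspect the encoded rows is to pair them with df_X's own column names, as in this small sketch:


In [ ]:
# Sketch: read the one-hot rows back using the columns actually present in
# df_X (after the baseline drop) instead of the vectorizer's full feature list.
[dict(zip(df_X.columns, row)) for row in df_X.values[:2]]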

Model Fit


In [401]:
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression(C=1e5)
# we create an instance of LogisticRegression and fit the data
logreg.fit(df_X.values, df_Y.values.ravel())


Out[401]:
LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)
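
With the model fitted we can inspect what was learned. Since multi_class='ovr' there is one row of coefficients per class, aligned with the columns of df_X; a quick inspection sketch (not part of the original notebook):


In [ ]:
# One coefficient row per (one-vs-rest) class, one column per feature in df_X.
print encoder.inverse_transform(logreg.classes_)   # original labels: E, M, T
pd.DataFrame(logreg.coef_, columns=df_X.columns,
             index=encoder.inverse_transform(logreg.classes_))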

Prediction


In [402]:
df_test = pd.DataFrame(
    data=[
   ['L', 'F'],
   ['M', 'S'],
   ['W', 'F']
   ], 
columns=['os', 'browser'])
df_test


Out[402]:
os browser
0 L F
1 M S
2 W F

In [391]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = pd.DataFrame(data=[['A', 'Y'],
                       ['A', 'Y'],
                       ['B', 'Z']], columns=['vara', 'varb'])
print D
X = v.fit_transform(D.to_dict(orient='records'))
print v.feature_names_
print X
D2 = pd.DataFrame(data=[['A', 'Y'],
                        ['A', 'Z'],
                        ['B', 'X']], columns=['vara', 'varb'])
 
print v.transform(D2.to_dict(orient='records'))
v2 = DictVectorizer(sparse=False)
print v2.fit_transform(D2.to_dict(orient='records'))


  vara varb
0    A    Y
1    A    Y
2    B    Z
['vara=A', 'vara=B', 'varb=Y', 'varb=Z']
[[ 1.  0.  1.  0.]
 [ 1.  0.  1.  0.]
 [ 0.  1.  0.  1.]]
[[ 1.  0.  1.  0.]
 [ 1.  0.  0.  1.]
 [ 0.  1.  0.  0.]]
[[ 1.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  1.]
 [ 0.  1.  1.  0.  0.]]

In [378]:
print df_test.to_dict(orient='records')
vectorizer_inverse = vectorizer.inverse_transform(df_X.values)
print vectorizer_inverse

#df_test_vec = vectorizer.fit_transform(df_test.to_dict(orient='records'))


[{'os': 'L', 'browser': 'F'}, {'os': 'M', 'browser': 'S'}, {'os': 'W', 'browser': 'F'}]
[{'browser=F': 1.0, 'os=L': 1.0}, {'browser=S': 1.0, 'os=L': 1.0}, {'os=L': 1.0}, {'os=L': 1.0}, {'browser=F': 1.0, 'os=M': 1.0}, {'os=M': 1.0, 'browser=S': 1.0}, {'os=M': 1.0}, {'os=M': 1.0}, {'browser=F': 1.0}, {'browser=S': 1.0}, {}, {}]

In [370]:
# Predict class labels for samples in X.
logreg.predict(df_test_vec)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-370-1cbb40240b2c> in <module>()
      1 # Predict class labels for samples in X.
----> 2 logreg.predict(df_test_vec)

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/sklearn/linear_model/base.pyc in predict(self, X)
    221             Predicted class label per sample.
    222         """
--> 223         scores = self.decision_function(X)
    224         if len(scores.shape) == 1:
    225             indices = (scores > 0).astype(np.int)

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/sklearn/linear_model/base.pyc in decision_function(self, X)
    202         if X.shape[1] != n_features:
    203             raise ValueError("X has %d features per sample; expecting %d"
--> 204                              % (X.shape[1], n_features))
    205 
    206         scores = safe_sparse_dot(X, self.coef_.T,

ValueError: X has 5 features per sample; expecting 4
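
The ValueError above happens because df_test_vec came from fit_transform on the test frame, which builds a fresh 5-feature encoding (browser=F/S, os=L/M/W) instead of reusing the 6 training features minus the 2 dropped baselines. A hedged sketch of the intended path: reuse the already-fitted vectorizer with transform, drop the same baseline columns, then predict.


In [ ]:
# Sketch: encode the test rows with the vectorizer fitted on the training
# data, drop the same baseline columns, and map predictions back to labels.
df_test_vec = pd.DataFrame(
    data=vectorizer.transform(df_test.to_dict(orient='records')),
    columns=vectorizer.feature_names_)
df_test_vec = df_test_vec.drop(['browser=S', 'os=W'], axis=1)
print encoder.inverse_transform(logreg.predict(df_test_vec.values))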

In [351]:
# Probability estimates. The returned estimates for all classes are ordered by the label of classes.
logreg.predict_proba([[1.0, 0.0, 1.0, 0.0]])


Out[351]:
array([[  9.99864245e-01,   9.75917308e-05,   3.81630360e-05]])

In [ ]:
# From predict_proba we see that the sample is far more likely to belong to
# class 0 ('E'), which is also what the .predict method returns for it.
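
The columns of predict_proba follow logreg.classes_, so they can be mapped back to the original category labels; a small sketch:


In [ ]:
# Sketch: label the probability columns with the original category names.
probs = logreg.predict_proba([[1.0, 0.0, 1.0, 0.0]])
pd.DataFrame(probs, columns=encoder.inverse_transform(logreg.classes_))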