In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
%matplotlib inline

OneHotEncoding for categorical data

Users = age, location

age is float
location in ['Paris', 'Tokyo', 'New York'], stored as the integer codes 0, 1, 2

In [ ]:
# Toy user table: one row per user, columns are [age, location code].
# Location codes: 0 = Paris, 1 = Tokyo, 2 = New York.
X = np.array([
    [15.9, 1],  # Tokyo
    [21.5, 2],  # New York
    [31.3, 0],  # Paris
    [25.1, 2],  # New York
    [63.6, 1],  # Tokyo
    [14.4, 1],  # Tokyo
])

# Binary target, one label per row of X.
y = np.array([0, 1, 1, 1, 0, 0])

In [ ]:
# Don't do this!
# The location column holds arbitrary integer codes (0, 1, 2). Passing them to a
# linear model unchanged makes the model treat them as an ordered numeric
# quantity, which the city codes are not.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=100)
lr.fit(X, y)
lr.score(X, y)

In [ ]:
# Fitted weights of the model above: one per column of X (age, location code).
lr.coef_

In [ ]:
from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument of OneHotEncoder has been removed from
# scikit-learn; choosing which columns to encode is now the job of
# ColumnTransformer.
#   - column 1 (location) is one-hot encoded,
#   - the remaining column (age) is passed through unchanged and placed to the
#     right of the one-hot columns (same layout the old argument produced),
#   - sparse_threshold=0 makes the result a dense NumPy array.
encoder = ColumnTransformer(
    [("location", OneHotEncoder(), [1])],
    remainder="passthrough",
    sparse_threshold=0,
).fit(X)
X_one_hot = encoder.transform(X)
X_one_hot

In [ ]:
# Refit with default settings on the one-hot encoded features and report the
# accuracy on the same rows the model was trained on.
lr = LogisticRegression()
lr.fit(X_one_hot, y)
lr.score(X_one_hot, y)
Users = age, likes puppies, location

age is float
likes puppies in ['yes', 'no'], stored as 1 (yes) / 0 (no)
location in ['Paris', 'Tokyo', 'New York'], stored as the integer codes 0, 1, 2

In [ ]:
# Toy user table with two categorical columns.
# Columns: [age, likes puppies (1 = yes, 0 = no), location code]
# Location codes: 0 = Paris, 1 = Tokyo, 2 = New York.
X = np.array([
    [15.9, 1, 1],  # likes puppies, Tokyo
    [21.5, 0, 2],  # doesn't like puppies, New York
    [31.3, 0, 0],  # doesn't like puppies, Paris
    [25.1, 1, 2],  # likes puppies, New York
    [63.6, 0, 1],  # doesn't like puppies, Tokyo
    [14.4, 1, 1],  # likes puppies, Tokyo
])

In [ ]:
# One-hot encode columns 1 (likes puppies) and 2 (location); column 0 (age) is
# passed through unchanged and ends up as the right-most output column.
# ColumnTransformer replaces the removed `categorical_features` argument of
# OneHotEncoder, and sparse_threshold=0 makes the result a dense array.
ColumnTransformer(
    [("onehot", OneHotEncoder(), [1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
).fit_transform(X)

In [ ]:
# `sklearn.cross_validation` no longer exists in scikit-learn;
# train_test_split is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Split the rows of X into a training part and a test part; random_state fixes
# the shuffle so the split is reproducible.
X_train, X_test = train_test_split(X, random_state=4)

print("X_train:\n%s" % X_train)
print("\nX_test:\n%s" % X_test)

In [ ]:
# Learn the category sets from the training rows only, then apply that same
# encoding to the test rows. Columns 1 and 2 are one-hot encoded; age is passed
# through as the last column. ColumnTransformer replaces the removed
# `categorical_features` argument; sparse_threshold=0 gives a dense array.
encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
).fit(X_train)
encoder.transform(X_test)

In [ ]:
# BAD
# Fitting a fresh encoder on the test rows derives the output columns from
# whatever categories happen to occur in X_test, so they need not line up with
# the columns produced by an encoder fitted on X_train.
# (ColumnTransformer replaces the removed `categorical_features` argument.)
ColumnTransformer(
    [("onehot", OneHotEncoder(), [1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
).fit_transform(X_test)

In [ ]:
# Split again with a different random_state, so the rows may be partitioned
# differently from the previous split. This overwrites X_train and X_test.
X_train, X_test = train_test_split(X, random_state=1)

print("X_train:\n%s" % X_train)
print("\nX_test:\n%s" % X_test)

In [ ]:
# Same recipe as before, now on the new split: categories are inferred from
# X_train only. ColumnTransformer replaces the removed `categorical_features`
# argument; sparse_threshold=0 gives a dense array.
# NOTE(review): OneHotEncoder defaults to handle_unknown="error", so transform
# raises ValueError if X_test contains a category that never occurs in X_train
# -- presumably the problem this split is meant to expose; confirm on run.
encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
).fit(X_train)
encoder.transform(X_test)

In [ ]:
# Declare the full category sets up front instead of inferring them from
# X_train: 2 possible values for "likes puppies" (0, 1) and 3 for location
# (0, 1, 2). This is the replacement for the removed `n_values=[2, 3]`
# argument, and it guarantees one output column per possible category even if
# some category is absent from the training rows.
# ColumnTransformer replaces the removed `categorical_features` argument;
# sparse_threshold=0 gives a dense array.
encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(categories=[[0, 1], [0, 1, 2]]), [1, 2])],
    remainder="passthrough",
    sparse_threshold=0,
).fit(X_train)
encoder.transform(X_test)

Also see pandas.get_dummies.


In [ ]: