In [54]:
import pandas as pd

from pandas.api.types import CategoricalDtype

In [55]:
# Record the pandas version that produced the outputs in this notebook.
# NOTE(review): other pandas releases may render the dummy columns differently
# (e.g. True/False instead of 0/1) — verify the outputs when re-running.
pd.__version__


Out[55]:
'1.0.3'

In [56]:
# Toy dataset: a single string column with one repeated value ('germany').
df = pd.DataFrame(
    {'country': ['russia', 'germany', 'australia', 'korea', 'germany']}
)
df


Out[56]:
country
0 russia
1 germany
2 australia
3 korea
4 germany

One-hot encoding a column in a pandas DataFrame


In [57]:
# Rebuild the sample frame so this cell can be run on its own.
df = pd.DataFrame(
    {'country': ['russia', 'germany', 'australia', 'korea', 'germany']}
)

# Passing the whole frame expands the string column into one indicator
# column per distinct value; `prefix` supplies the column-name stem.
pd.get_dummies(df, prefix=['country'])


Out[57]:
country_australia country_germany country_korea country_russia
0 0 0 0 1
1 0 1 0 0
2 1 0 0 0
3 0 0 1 0
4 0 1 0 0

One-hot encoding vs Dummy variables


In [58]:
# Rebuild the sample frame so this cell can be run on its own.
df = pd.DataFrame(
    {'country': ['russia', 'germany', 'australia', 'korea', 'germany']}
)

# drop_first=True omits the column for the first category
# ('australia' here), so that value is represented by an all-zero row.
pd.get_dummies(
    df["country"],
    prefix='country',
    drop_first=True,
)


Out[58]:
country_germany country_korea country_russia
0 0 0 1
1 1 0 0
2 0 0 0
3 0 1 0
4 1 0 0

Add columns for categories that only appear in the test set


In [59]:
df = pd.DataFrame(
    {'country': ['russia', 'germany', 'australia', 'korea', 'germany']}
)

# Declare every category up front, including 'japan', which does not occur
# in this frame, then cast the column to that categorical dtype.
country_dtype = CategoricalDtype(["australia","germany","korea","russia","japan"])
df["country"] = df["country"].astype(country_dtype)

In [60]:
# The displayed values are unchanged by the cast; 'japan' exists only as a
# declared category with no rows.
df


Out[60]:
country
0 russia
1 germany
2 australia
3 korea
4 germany

In [61]:
# Because the column is categorical, every declared category gets a column:
# 'japan' appears as an all-zero column even though no row contains it.
pd.get_dummies(df["country"],prefix='country')


Out[61]:
country_australia country_germany country_korea country_russia country_japan
0 0 0 0 1 0
1 0 1 0 0 0
2 1 0 0 0 0
3 0 0 1 0 0
4 0 1 0 0 0

Add dummy columns to dataframe


In [62]:
import pandas as pd

# df now has two columns: name and country
df = pd.DataFrame({
        'name': ['josef','michael','john','bawool','klaus'],
        'country': ['russia', 'germany', 'australia','korea','germany']
    })

# one-hot encode 'country' into its own frame of indicator columns
country_dummies = pd.get_dummies(df['country'], prefix='country')

# replace the original 'country' column with its dummy columns in one step;
# drop(columns=...) returns a new frame, so nothing is mutated in place and
# the cell gives the same result every time it is re-run
df = pd.concat([df.drop(columns=['country']), country_dummies], axis=1)

In [63]:
# Show the result: 'name' followed by one indicator column per country.
df


Out[63]:
name country_australia country_germany country_korea country_russia
0 josef 0 0 0 1
1 michael 0 1 0 0
2 john 1 0 0 0
3 bawool 0 0 1 0
4 klaus 0 1 0 0

Treat Nulls/NaNs as a separate category


In [64]:
import numpy as np

In [65]:
# Sample column containing one missing value (row 1).
df = pd.DataFrame(
    {'country': ['germany',np.nan,'germany','united kingdom','america','united kingdom']}
)
df


Out[65]:
country
0 germany
1 NaN
2 germany
3 united kingdom
4 america
5 united kingdom

In [66]:
# dummy_na=True adds an extra NaN indicator column; the row with the missing
# value (index 1) is flagged there instead of being all zeros.
pd.get_dummies(df["country"],dummy_na=True)


Out[66]:
america germany united kingdom NaN
0 0 1 0 0
1 0 0 0 1
2 0 1 0 0
3 0 0 1 0
4 1 0 0 0
5 0 0 1 0

One-hot encode a single column


In [67]:
# Two-column frame: a boolean flag alongside a string column containing one
# missing value.
has_dogs = [True,False,True,True,False,True]
country = ['germany',np.nan,'germany','united kingdom','america','united kingdom']
df = pd.DataFrame({'has_dogs': has_dogs, 'country': country})

In [68]:
# Encode the whole frame: the boolean 'has_dogs' column is passed through
# unchanged and only the string 'country' column is expanded. Since dummy_na
# is not set, the row with the missing country (index 1) is zero in every
# country_* column.
pd.get_dummies(df)


Out[68]:
has_dogs country_america country_germany country_united kingdom
0 True 0 1 0
1 False 0 0 0
2 True 0 1 0
3 True 0 0 1
4 False 1 0 0
5 True 0 0 1

In [69]:
# Rebuild the single-column sample frame so this cell can be run on its own.
df = pd.DataFrame(
    {'country': ['russia', 'germany', 'australia', 'korea', 'germany']}
)

# Same encoding as before, applied to the whole frame: drop_first=True omits
# the first category's column ('australia'), which becomes the all-zero row.
pd.get_dummies(
    df,
    prefix=['country'],
    drop_first=True,
)


Out[69]:
country_germany country_korea country_russia
0 0 0 1
1 1 0 0
2 0 0 0
3 0 1 0
4 1 0 0