In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
df_A, df_B = df[:3].copy(), df[3:].copy()

In [3]:
print(df_A)


         age state  point
name                     
Alice     24    NY     64
Bob       42    CA     92
Charlie   18    CA     70

In [4]:
print(df_B)


       age state  point
name                   
Dave    68    TX     70
Ellen   24    CA     88
Frank   30    NY     57

In [5]:
print(pd.get_dummies(df_A))


         age  point  state_CA  state_NY
name                                   
Alice     24     64         0         1
Bob       42     92         1         0
Charlie   18     70         1         0

In [6]:
print(pd.get_dummies(df_B))


       age  point  state_CA  state_NY  state_TX
name                                           
Dave    68     70         0         0         1
Ellen   24     88         1         0         0
Frank   30     57         0         1         0

In [7]:
categories = set(df_A['state'].unique().tolist() + df_B['state'].unique().tolist())
print(categories)


{'NY', 'TX', 'CA'}

In [8]:
df_A['state'] = pd.Categorical(df_A['state'], categories=categories)
df_B['state'] = pd.Categorical(df_B['state'], categories=categories)

In [9]:
print(df_A['state'].dtypes)


category

In [10]:
print(pd.get_dummies(df_A))


         age  point  state_NY  state_TX  state_CA
name                                             
Alice     24     64         1         0         0
Bob       42     92         0         0         1
Charlie   18     70         0         0         1

In [11]:
print(pd.get_dummies(df_B))


       age  point  state_NY  state_TX  state_CA
name                                           
Dave    68     70         0         1         0
Ellen   24     88         0         0         1
Frank   30     57         1         0         0

In [12]:
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
df_train, df_test = df[:3].copy(), df[3:].copy()

In [13]:
categories = df_train['state'].unique()

In [14]:
df_train['state'] = pd.Categorical(df_train['state'], categories=categories)
df_test['state'] = pd.Categorical(df_test['state'], categories=categories)

In [15]:
print(df_test)


       age state  point
name                   
Dave    68   NaN     70
Ellen   24    CA     88
Frank   30    NY     57

In [16]:
print(pd.get_dummies(df_train))


         age  point  state_NY  state_CA
name                                   
Alice     24     64         1         0
Bob       42     92         0         1
Charlie   18     70         0         1

In [17]:
print(pd.get_dummies(df_test))


       age  point  state_NY  state_CA
name                                 
Dave    68     70         0         0
Ellen   24     88         0         1
Frank   30     57         1         0

In [18]:
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
df_train, df_test = df[:3].copy(), df[3:].copy()

In [19]:
cols = df_train.select_dtypes('object').columns

In [20]:
for col in cols:
    categories = df_train[col].unique()
    df_train[col] = pd.Categorical(df_train[col], categories=categories)
    df_test[col] = pd.Categorical(df_test[col], categories=categories)

In [21]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [22]:
print(df_train)


         age  point  state_NY  state_CA
name                                   
Alice     24     64         1         0
Bob       42     92         0         1
Charlie   18     70         0         1

In [23]:
print(df_test)


       age  point  state_NY  state_CA
name                                 
Dave    68     70         0         0
Ellen   24     88         0         1
Frank   30     57         1         0