In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
df1 = DataFrame({'key': ['X','Z','Y','Z','X','X'], 'data_set_1': np.arange(6)})

df1


Out[3]:
data_set_1 key
0 0 X
1 1 Z
2 2 Y
3 3 Z
4 4 X
5 5 X

In [4]:
df2 = DataFrame({'key': ['Q','Y','Z'], 'data_set_2': [1,2,3]})

df2


Out[4]:
data_set_2 key
0 1 Q
1 2 Y
2 3 Z

In [5]:
pd.merge(df1,df2)


Out[5]:
data_set_1 key data_set_2
0 1 Z 3
1 3 Z 3
2 2 Y 2

In [6]:
# merge using specific column
# this is equivalent to the last line
pd.merge(df1,df2,on='key')


Out[6]:
data_set_1 key data_set_2
0 1 Z 3
1 3 Z 3
2 2 Y 2

In [7]:
pd.merge(df1,df2,on='key',how='left')


Out[7]:
data_set_1 key data_set_2
0 0 X NaN
1 1 Z 3.0
2 2 Y 2.0
3 3 Z 3.0
4 4 X NaN
5 5 X NaN

In [8]:
pd.merge(df1,df2,on='key',how='right')


Out[8]:
data_set_1 key data_set_2
0 1.0 Z 3
1 3.0 Z 3
2 2.0 Y 2
3 NaN Q 1

In [9]:
pd.merge(df1,df2,on='key',how='outer')


Out[9]:
data_set_1 key data_set_2
0 0.0 X NaN
1 4.0 X NaN
2 5.0 X NaN
3 1.0 Z 3.0
4 3.0 Z 3.0
5 2.0 Y 2.0
6 NaN Q 1.0

In [11]:
# many to many merge
df3 = DataFrame({'key':['X','X','X','Y','Z','Z'],'data_set_3':range(6)})

df3


Out[11]:
data_set_3 key
0 0 X
1 1 X
2 2 X
3 3 Y
4 4 Z
5 5 Z

In [12]:
df4 = DataFrame({'key':['Y','Y','X','X','Z'],'data_set_4': range(5)})

df4


Out[12]:
data_set_4 key
0 0 Y
1 1 Y
2 2 X
3 3 X
4 4 Z

In [13]:
pd.merge(df3,df4)


Out[13]:
data_set_3 key data_set_4
0 0 X 2
1 0 X 3
2 1 X 2
3 1 X 3
4 2 X 2
5 2 X 3
6 3 Y 0
7 3 Y 1
8 4 Z 4
9 5 Z 4

In [15]:
df_left = DataFrame({'key1':['SF','SF','LA'],
                     'key2':['one','two','one'],
                     'left_data':[10,20,30]})

df_left


Out[15]:
key1 key2 left_data
0 SF one 10
1 SF two 20
2 LA one 30

In [16]:
df_right = DataFrame({'key1':['SF','SF','LA','LA'],
                     'key2':['one','one','one','two'],
                     'right_data':[40,50,60,70]})

df_right


Out[16]:
key1 key2 right_data
0 SF one 40
1 SF one 50
2 LA one 60
3 LA two 70

In [17]:
pd.merge(df_left,df_right,on=['key1','key2'],how='outer')


Out[17]:
key1 key2 left_data right_data
0 SF one 10.0 40.0
1 SF one 10.0 50.0
2 SF two 20.0 NaN
3 LA one 30.0 60.0
4 LA two NaN 70.0

In [19]:
# pandas by default, will keep columns with matching names
pd.merge(df_left,df_right,on='key1')


Out[19]:
key1 key2_x left_data key2_y right_data
0 SF one 10 one 40
1 SF one 10 one 50
2 SF two 20 one 40
3 SF two 20 one 50
4 LA one 30 one 60
5 LA one 30 two 70

In [21]:
# specify suffixes
pd.merge(df_left,df_right,on='key1',suffixes=('_lefty','_righty'))


Out[21]:
key1 key2_lefty left_data key2_righty right_data
0 SF one 10 one 40
1 SF one 10 one 50
2 SF two 20 one 40
3 SF two 20 one 50
4 LA one 30 one 60
5 LA one 30 two 70

In [ ]: