In [2]:
import pandas as pd
import numpy as np

pd.__version__, np.__version__


Out[2]:
('0.24.2', '1.16.3')

In [12]:
df_employees = pd.DataFrame({
    'id':[1,2,3,4],
    'name':['alice','bob','charlie','david'],
    'company_id':[1,2,1,2]
})
df_employees


Out[12]:
id name company_id
0 1 alice 1
1 2 bob 2
2 3 charlie 1
3 4 david 2

In [14]:
df_companies = pd.DataFrame({
    'id':[1,2],
    'name': ['bell labs', 'xerox']
})
df_companies


Out[14]:
id name
0 1 bell labs
1 2 xerox

merge on multiple columns


In [35]:
df_employees_sal = pd.DataFrame({
    'year':[1980,1981,1980,1981,1980,1981,1980,1981],
    'id':[1,1,2,2,3,3,4,4],
    'name':['alice','alice','bob','bob','charlie','charlie','david','david'],
    'salary':[30000,30000,40000,41000,35000,40000,45000,45000],
    'company_id':[1,1,2,2,1,1,2,2]
})
df_employees_sal.sort_values(by=['year','name'])


Out[35]:
year id name salary company_id
0 1980 1 alice 30000 1
2 1980 2 bob 40000 2
4 1980 3 charlie 35000 1
6 1980 4 david 45000 2
1 1981 1 alice 30000 1
3 1981 2 bob 41000 2
5 1981 3 charlie 40000 1
7 1981 4 david 45000 2

In [36]:
df_companies_rev = pd.DataFrame({
    'year':[1980,1981,1980,1981],
    'id':[1,1,2,2],
    'name':['bell labs','bell labs','xerox','xerox'],
    'revenue':[1130000,1130000,5000000,500000]
})
df_companies_rev.sort_values(by=['year','name'])


Out[36]:
year id name revenue
0 1980 1 bell labs 1130000
2 1980 2 xerox 5000000
1 1981 1 bell labs 1130000
3 1981 2 xerox 500000

In [45]:
pd.merge(
    df_employees_sal,
    df_companies_rev,
    left_on=['year','company_id'],
    right_on=['year','id']
).rename(columns={'id_x':'id','name_x':'name','name_y':'company_name'}).drop(['id_y','company_id'],axis=1).sort_values(
by=['year','name'])


Out[45]:
year id name salary company_name revenue
0 1980 1 alice 30000 bell labs 1130000
4 1980 2 bob 40000 xerox 5000000
1 1980 3 charlie 35000 bell labs 1130000
5 1980 4 david 45000 xerox 5000000
2 1981 1 alice 30000 bell labs 1130000
6 1981 2 bob 41000 xerox 500000
3 1981 3 charlie 40000 bell labs 1130000
7 1981 4 david 45000 xerox 500000

rename duplicate columns


In [44]:
pd.merge(
    df_employees_sal,
    df_companies_rev,
    left_on=['year','company_id'],
    right_on=['year','id'],
    suffixes=('_left','_right')    
)


Out[44]:
year id_left name_left salary company_id id_right name_right revenue
0 1980 1 alice 30000 1 1 bell labs 1130000
1 1980 3 charlie 35000 1 1 bell labs 1130000
2 1981 1 alice 30000 1 1 bell labs 1130000
3 1981 3 charlie 40000 1 1 bell labs 1130000
4 1980 2 bob 40000 2 2 xerox 5000000
5 1980 4 david 45000 2 2 xerox 5000000
6 1981 2 bob 41000 2 2 xerox 500000
7 1981 4 david 45000 2 2 xerox 500000

In [ ]: