Joining and merging dataframes

https://youtu.be/XMjSGGej9y8


In [6]:
import pandas as pd;

df1 = pd.DataFrame({'HPI':[80,85,88,85],
                    'Int_rate':[2, 3, 2, 2],
                    'US_GDP_Thousands':[50, 55, 65, 55]},
                   index = [2001, 2002, 2003, 2004])

df2 = pd.DataFrame({'HPI':[80,85,88,85],
                    'Int_rate':[2, 3, 2, 2],
                    'US_GDP_Thousands':[50, 55, 65, 55]},
                   index = [2005, 2006, 2007, 2008])

df3 = pd.DataFrame({'HPI':[80,85,88,85],
                    'Unemployment':[7, 8, 9, 6],
                    'Low_tier_HPI':[50, 52, 50, 53]},
                   index = [2001, 2002, 2003, 2004])

In [12]:
print(pd.merge(df1, df2, on = "HPI")); #like SQL merge


   HPI  Int_rate_x  US_GDP_Thousands_x  Int_rate_y  US_GDP_Thousands_y
0   80           2                  50           2                  50
1   85           3                  55           3                  55
2   85           3                  55           2                  55
3   85           2                  55           3                  55
4   85           2                  55           2                  55
5   88           2                  65           2                  65

In [13]:
print(pd.merge(df1, df2, on = ["HPI", "Int_rate"]));


   HPI  Int_rate  US_GDP_Thousands_x  US_GDP_Thousands_y
0   80         2                  50                  50
1   85         3                  55                  55
2   88         2                  65                  65
3   85         2                  55                  55

In [15]:
df1_indexed = df1.set_index("HPI");
df3_indexed = df3.set_index("HPI");
joindex = df1_indexed.join(df3_indexed);

print(joindex);
print(pd.merge(df1, df3, on = "HPI"));


     Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
HPI                                                        
80          2                50            50             7
85          3                55            52             8
85          3                55            53             6
85          2                55            52             8
85          2                55            53             6
88          2                65            50             9
   HPI  Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
0   80         2                50            50             7
1   85         3                55            52             8
2   85         3                55            53             6
3   85         2                55            52             8
4   85         2                55            53             6
5   88         2                65            50             9

In [17]:
ndf1 = pd.DataFrame({'Year':[2001, 2002, 2003, 2004],
                    'Int_rate':[2, 3, 2, 2],
                    'US_GDP_Thousands':[50, 55, 65, 55]});

ndf2 = pd.DataFrame({'Year':[2001, 2003, 2004, 2005],
                    'Unemployment':[7, 8, 9, 6],
                    'Low_tier_HPI':[50, 52, 50, 53]});

merged = pd.merge(ndf1, ndf2, on = "Year");
merged.set_index("Year", inplace = True);
print(merged);


      Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
Year                                                        
2001         2                50            50             7
2003         2                65            52             8
2004         2                55            50             9

In [18]:
merged = pd.merge(ndf1, ndf2, on = "Year", how = "left");
merged.set_index("Year", inplace = True);
print(merged);


      Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
Year                                                        
2001         2                50          50.0           7.0
2002         3                55           NaN           NaN
2003         2                65          52.0           8.0
2004         2                55          50.0           9.0

In [19]:
merged = pd.merge(ndf1, ndf2, on = "Year", how = "right");
merged.set_index("Year", inplace = True);
print(merged);


      Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
Year                                                        
2001       2.0              50.0            50             7
2003       2.0              65.0            52             8
2004       2.0              55.0            50             9
2005       NaN               NaN            53             6

In [20]:
merged = pd.merge(ndf1, ndf2, on = "Year", how = "outer");
merged.set_index("Year", inplace = True);
print(merged);


      Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
Year                                                        
2001       2.0              50.0          50.0           7.0
2002       3.0              55.0           NaN           NaN
2003       2.0              65.0          52.0           8.0
2004       2.0              55.0          50.0           9.0
2005       NaN               NaN          53.0           6.0

In [21]:
merged = pd.merge(ndf1, ndf2, on = "Year", how = "inner"); #default
merged.set_index("Year", inplace = True);
print(merged);


      Int_rate  US_GDP_Thousands  Low_tier_HPI  Unemployment
Year                                                        
2001         2                50            50             7
2003         2                65            52             8
2004         2                55            50             9

In [ ]:
# Use merge when index doesn't matter smth to you, and join when it does.