This notebook updates the original Excel diff article to work with more recent versions of pandas that no longer support the Panel data structure.
The new article can be read here.
In [1]:
import pandas as pd
In [2]:
# Define the diff function to show the changes in each field
def report_diff(x):
    return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x)
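As a quick sanity check with made-up values, report_diff passes identical old/new values through unchanged and flags a difference with an arrow:

# Identical values come back unchanged
report_diff(('123 Main St', '123 Main St'))    # '123 Main St'
# Differing values are flagged
report_diff(('123 Main St', '456 Oak Ave'))    # '123 Main St ---> 456 Oak Ave'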
In [3]:
# Read in the two files but call the data old and new and create columns to track
old = pd.read_excel('sample-address-1.xlsx', 'Sheet1', na_values=['NA'])
new = pd.read_excel('sample-address-2.xlsx', 'Sheet1', na_values=['NA'])
old['version'] = "old"
new['version'] = "new"
In [4]:
old.head()
Out[4]:
In [5]:
new.head()
Out[5]:
In [6]:
# We use the account numbers as the keys to check what is added, dropped and potentially changed
# Using sets makes the deduping easy and we can use set operations to figure out groupings
old_accts_all = set(old['account number'])
new_accts_all = set(new['account number'])
dropped_accts = old_accts_all - new_accts_all
added_accts = new_accts_all - old_accts_all
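The set arithmetic is the whole trick here; with made-up account numbers it looks like this:

# Illustration with made-up account numbers
old_example = {101, 102, 103}
new_example = {102, 103, 104}
old_example - new_example    # {101} - only in the old file, i.e. dropped
new_example - old_example    # {104} - only in the new file, i.e. added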
In [7]:
# Join all the data together and ignore indexes so it all gets concatenated
all_data = pd.concat([old, new], ignore_index=True)
In [8]:
all_data.head()
Out[8]:
In [9]:
# Let's see what changes in the main columns we care about
# Note the updated drop_duplicates syntax: keep='last'
changes = all_data.drop_duplicates(subset=["account number",
                                           "name", "street",
                                           "city", "state",
                                           "postal code"], keep='last')
In [10]:
changes.head()
Out[10]:
In [11]:
# Get all the duplicate rows - account numbers that still appear twice have changed
dupe_accts = changes[changes['account number'].duplicated()]['account number'].tolist()
dupes = changes[changes["account number"].isin(dupe_accts)]
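Once the exact duplicates are gone, any account number that still shows up twice must differ between the two files, which is exactly what duplicated() flags. A minimal illustration:

# duplicated() marks the second (and later) occurrences of a value
pd.Series([101, 102, 102, 103]).duplicated().tolist()    # [False, False, True, False]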
In [12]:
dupes
Out[12]:
In [13]:
# Pull out the old and new data into separate dataframes
change_new = dupes[(dupes["version"] == "new")]
change_old = dupes[(dupes["version"] == "old")]
In [14]:
# Drop the temporary version column - we don't need it now
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
In [15]:
# Index on the account numbers
change_new.set_index('account number', inplace=True)
change_old.set_index('account number', inplace=True)
In [16]:
df_all_changes = pd.concat([change_old, change_new],
                           axis='columns',
                           keys=['old', 'new'],
                           join='outer')
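Passing keys=['old', 'new'] builds a two-level column index, so each field appears once under 'old' and once under 'new'. A small sketch with made-up data:

old_demo = pd.DataFrame({"name": ["Smith"]}, index=[101])
new_demo = pd.DataFrame({"name": ["Smith Co."]}, index=[101])
pd.concat([old_demo, new_demo], axis='columns', keys=['old', 'new'])
#        old        new
#       name       name
# 101  Smith  Smith Co.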
In [17]:
df_all_changes
Out[17]:
In [18]:
df_all_changes = df_all_changes.swaplevel(axis='columns')[change_new.columns[0:]]
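swaplevel(axis='columns') flips the two column levels so the field name becomes the outer level and old/new the inner one; the trailing [change_new.columns[0:]] then puts the fields back in their original order. Applied to the same kind of small sketch:

demo = pd.concat([pd.DataFrame({"name": ["Smith"]}, index=[101]),
                  pd.DataFrame({"name": ["Smith Co."]}, index=[101])],
                 axis='columns', keys=['old', 'new'])
demo.swaplevel(axis='columns')
#       name
#        old        new
# 101  Smith  Smith Co.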
In [19]:
df_all_changes
Out[19]:
In [20]:
df_changed = df_all_changes.groupby(level=0, axis=1).apply(lambda frame: frame.apply(report_diff, axis=1))
df_changed = df_changed.reset_index()
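One note if you are on pandas 2.x: groupby(..., axis=1) is deprecated there. A sketch of one way to get the same result (up to column order) without the column-axis groupby is to apply report_diff to each (old, new) column pair directly:

# Apply report_diff field by field instead of grouping on the column axis
df_changed_alt = pd.DataFrame({
    col: df_all_changes[col].apply(report_diff, axis=1)
    for col in df_all_changes.columns.get_level_values(0).unique()
})
df_changed_alt = df_changed_alt.reset_index()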
In [21]:
df_changed
Out[21]:
In [22]:
# Diff'ing is done, we need to get a list of removed and added items
In [23]:
df_removed = changes[changes["account number"].isin(dropped_accts)]
df_removed
Out[23]:
In [24]:
df_added = changes[changes["account number"].isin(added_accts)]
df_added
Out[24]:
In [25]:
# Save the changes to Excel but only include the columns we care about
output_columns = ["account number", "name", "street", "city", "state", "postal code"]
writer = pd.ExcelWriter("my-diff.xlsx")
df_changed.to_excel(writer, "changed", index=False, columns=output_columns)
df_removed.to_excel(writer, "removed", index=False, columns=output_columns)
df_added.to_excel(writer, "added", index=False, columns=output_columns)
writer.save()
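One more compatibility note: writer.save() was removed in pandas 2.0. On newer versions the same workbook can be written with ExcelWriter as a context manager, which saves and closes the file automatically:

# pandas >= 2.0: the context manager handles saving and closing
with pd.ExcelWriter("my-diff.xlsx") as writer:
    df_changed.to_excel(writer, "changed", index=False, columns=output_columns)
    df_removed.to_excel(writer, "removed", index=False, columns=output_columns)
    df_added.to_excel(writer, "added", index=False, columns=output_columns)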