This notebook updates the original Excel diff article to work with more recent versions of pandas that no longer support the Panel data structure.
The new article can be read here.
In [1]:
import pandas as pd
In [2]:
# Define the diff function to show the changes in each field
def report_diff(x):
    return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x)
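As a quick sanity check with made-up values, report_diff passes identical old/new values through unchanged and flags a difference with an arrow:

# Identical values come back unchanged
report_diff(('123 Main St', '123 Main St'))    # '123 Main St'
# Differing values are flagged
report_diff(('123 Main St', '456 Oak Ave'))    # '123 Main St ---> 456 Oak Ave'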
In [3]:
# Read in the two files but call the data old and new and create columns to track
old = pd.read_excel('sample-address-1.xlsx', 'Sheet1', na_values=['NA'])
new = pd.read_excel('sample-address-2.xlsx', 'Sheet1', na_values=['NA'])
old['version'] = "old"
new['version'] = "new"
In [4]:
old.head()
Out[4]:
In [5]:
new.head()
Out[5]:
In [6]:
# We use the account numbers as the keys to check what is added, dropped and potentially changed
# Using sets makes the deduping easy and we can use set operations to figure out groupings
old_accts_all = set(old['account number'])
new_accts_all = set(new['account number'])
dropped_accts = old_accts_all - new_accts_all
added_accts = new_accts_all - old_accts_all
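The set arithmetic is the whole trick here; with made-up account numbers it looks like this:

# Illustration with made-up account numbers
old_example = {101, 102, 103}
new_example = {102, 103, 104}
old_example - new_example    # {101} - only in the old file, i.e. dropped
new_example - old_example    # {104} - only in the new file, i.e. added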
In [7]:
# Join all the data together and ignore indexes so it all gets concatenated
all_data = pd.concat([old, new], ignore_index=True)
In [8]:
all_data.head()
Out[8]:
In [9]:
# Let's see what changes in the main columns we care about
# Note the updated drop_duplicates syntax: keep='last'
changes = all_data.drop_duplicates(subset=["account number",
                                           "name", "street",
                                           "city", "state",
                                           "postal code"], keep='last')
In [10]:
changes.head()
Out[10]:
In [11]:
# Get all the duplicate rows - account numbers that still appear twice have changed
dupe_accts = changes[changes['account number'].duplicated()]['account number'].tolist()
dupes = changes[changes["account number"].isin(dupe_accts)]
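Once the exact duplicates are gone, any account number that still shows up twice must differ between the two files, which is exactly what duplicated() flags. A minimal illustration:

# duplicated() marks the second (and later) occurrences of a value
pd.Series([101, 102, 102, 103]).duplicated().tolist()    # [False, False, True, False]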
In [12]:
dupes
Out[12]:
In [13]:
# Pull out the old and new data into separate dataframes
change_new = dupes[(dupes["version"] == "new")]
change_old = dupes[(dupes["version"] == "old")]
In [14]:
# Drop the temporary version column - we don't need it now
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
In [15]:
# Index on the account numbers
change_new.set_index('account number', inplace=True)
change_old.set_index('account number', inplace=True)
In [16]:
df_all_changes = pd.concat([change_old, change_new],
                           axis='columns',
                           keys=['old', 'new'],
                           join='outer')
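Passing keys=['old', 'new'] builds a two-level column index, so each field appears once under 'old' and once under 'new'. A small sketch with made-up data:

old_demo = pd.DataFrame({"name": ["Smith"]}, index=[101])
new_demo = pd.DataFrame({"name": ["Smith Co."]}, index=[101])
pd.concat([old_demo, new_demo], axis='columns', keys=['old', 'new'])
#        old        new
#       name       name
# 101  Smith  Smith Co.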
In [17]:
df_all_changes
Out[17]:
In [18]:
df_all_changes = df_all_changes.swaplevel(axis='columns')[change_new.columns[0:]]
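swaplevel(axis='columns') flips the two column levels so the field name becomes the outer level and old/new the inner one; the trailing [change_new.columns[0:]] then puts the fields back in their original order. Applied to the same kind of small sketch:

demo = pd.concat([pd.DataFrame({"name": ["Smith"]}, index=[101]),
                  pd.DataFrame({"name": ["Smith Co."]}, index=[101])],
                 axis='columns', keys=['old', 'new'])
demo.swaplevel(axis='columns')
#       name
#        old        new
# 101  Smith  Smith Co.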
In [19]:
df_all_changes
Out[19]:
In [20]:
df_changed = df_all_changes.groupby(level=0, axis=1).apply(lambda frame: frame.apply(report_diff, axis=1))
df_changed = df_changed.reset_index()
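One note if you are on pandas 2.x: groupby(..., axis=1) is deprecated there. A sketch of one way to get the same result (up to column order) without the column-axis groupby is to apply report_diff to each (old, new) column pair directly:

# Apply report_diff field by field instead of grouping on the column axis
df_changed_alt = pd.DataFrame({
    col: df_all_changes[col].apply(report_diff, axis=1)
    for col in df_all_changes.columns.get_level_values(0).unique()
})
df_changed_alt = df_changed_alt.reset_index()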
In [21]:
df_changed
Out[21]:
In [22]:
# Diff'ing is done, we need to get a list of removed and added items
In [23]:
df_removed = changes[changes["account number"].isin(dropped_accts)]
df_removed
Out[23]:
In [24]:
df_added = changes[changes["account number"].isin(added_accts)]
df_added
Out[24]:
In [25]:
# Save the changes to Excel but only include the columns we care about
output_columns = ["account number", "name", "street", "city", "state", "postal code"]
writer = pd.ExcelWriter("my-diff.xlsx")
df_changed.to_excel(writer, "changed", index=False, columns=output_columns)
df_removed.to_excel(writer, "removed", index=False, columns=output_columns)
df_added.to_excel(writer, "added", index=False, columns=output_columns)
writer.save()
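One more compatibility note: writer.save() was removed in pandas 2.0. On newer versions the same workbook can be written with ExcelWriter as a context manager, which saves and closes the file automatically:

# pandas >= 2.0: the context manager handles saving and closing
with pd.ExcelWriter("my-diff.xlsx") as writer:
    df_changed.to_excel(writer, "changed", index=False, columns=output_columns)
    df_removed.to_excel(writer, "removed", index=False, columns=output_columns)
    df_added.to_excel(writer, "added", index=False, columns=output_columns)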