In [27]:
import pandas as pd
import openpyxl as oxl
from subprocess import call
from os import remove, chdir, path, getcwd
from io import BytesIO
from sys import argv
from zipfile import ZipFile, ZIP_DEFLATED
import lxml
In [3]:
# TODO: move these settings to a .settings file to add clarity for future editors
WD_report_name = "EXP031-RPT-Process-Accruals_with_Expense_Report.xlsx"
WD2_report_name = "EXP032-RPT-Process-Accruals-_No_Expense.xlsx"
wrangled_WD_report_name = "EXP031_plus_EXP032_wrangled.csv"
wrangled_WD2_report_name = "EXP032_wrangled.csv"
MF_JE_template_name = "MF_JE_template.xlsx"
master_file_name = "WD_Accruals_Master.xlsx"
input_folder_path = "../Script/Input/"
generic_GL_account = 46540000
In [4]:
def generate_vbs_script():
# this VBS script converts XLSX files to CSV format for faster processing
vbscript = """if WScript.Arguments.Count < 3 Then
WScript.Echo "Please specify the source and the destination files. Usage: ExcelToCsv <xls/xlsx source file> <csv destination file> <worksheet number (starts at 1)>"
Wscript.Quit
End If
csv_format = 6
Set objFSO = CreateObject("Scripting.FileSystemObject")
src_file = objFSO.GetAbsolutePathName(Wscript.Arguments.Item(0))
dest_file = objFSO.GetAbsolutePathName(WScript.Arguments.Item(1))
worksheet_number = CInt(WScript.Arguments.Item(2))
Dim oExcel
Set oExcel = CreateObject("Excel.Application")
Dim oBook
Set oBook = oExcel.Workbooks.Open(src_file)
oBook.Worksheets(worksheet_number).Activate
oBook.SaveAs dest_file, csv_format
oBook.Close False
oExcel.Quit
"""
try:
with open("ExcelToCsv.vbs", "wb") as f:
f.write(vbscript.encode("utf-8"))
    except Exception as e:
        print(e)  # standard exceptions have no .msg attribute, so print the exception itself
        print("The VBS script for converting XLSX files to CSV could not be generated.")
In [5]:
def load_csv(xlsx_file_path, has_sheets=False, skiprows=None, usecols=None):
    # convert the input XLSX file to CSV with the generated VBS script, then load the CSV into pandas
if has_sheets:
        # sheet numbers to convert: the first two (range's end is exclusive). The count is hard-coded
        # because the sheet count is never queried here; see the sketch after this cell
        sheets = map(str, range(1, 3))
sheet_dataframes = []
for sheet in sheets:
csv_file_path = "../Script/{}{}{}".format(input_folder_path, sheet, ".csv")
call(["cscript.exe", "../Script/ExcelToCsv.vbs", xlsx_file_path, csv_file_path, sheet, r"//B"])
            try:
                sheet_dataframe = pd.read_csv(csv_file_path, encoding="latin-1", engine="c", usecols=usecols)
                sheet_dataframes.append(sheet_dataframe)
            except Exception as e:
                print(e)
                print("Sheet {} could not be converted to CSV format.".format(sheet))
return tuple(sheet_dataframes)
else:
csv_file_path = "{}{}".format(xlsx_file_path[:-4], "csv")
# //B is for batch mode; this is to avoid spam on the console :)
call(["cscript.exe", "../Script/ExcelToCsv.vbs", xlsx_file_path, csv_file_path, str(1), r"//B"])
        # skiprows=None is read_csv's default, so a single call covers both cases
        try:
            data = pd.read_csv(csv_file_path, skiprows=skiprows, encoding="latin-1", engine="c", usecols=usecols)
        except Exception as e:
            print(e)
            print("Something went wrong... make sure the report names weren't changed, or debug load_csv()")
            raise
        return data
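In [ ]:
# The sheet range in load_csv() is hard-coded because the sheet count is never
# queried. A sketch of how it could be retrieved with openpyxl (read_only mode
# avoids loading cell data); the helper name is illustrative.
def count_sheets(xlsx_file_path):
    workbook = oxl.load_workbook(xlsx_file_path, read_only=True)
    try:
        return len(workbook.sheetnames)
    finally:
        workbook.close()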
In [28]:
def load_all():
file_names = [WD_report_name, WD2_report_name, MF_JE_template_name, master_file_name]
dataframes = []
WD1_required_cols = ["Entity Code", "Cost Center", "Expense Report Number", "Expense Item", "Net Amount LC"]
WD2_required_cols = ["Transaction ID", "Billing Amount", "Currency", "Report Cost Location"]
    # generate the VBS script that load_csv() uses to convert XLSX files to CSV for faster processing
generate_vbs_script()
for file_name in file_names:
if file_name == WD_report_name:
usecols = WD1_required_cols
skiprows = [0]
elif file_name == WD2_report_name:
usecols = WD2_required_cols
elif file_name == MF_JE_template_name:
with open("{}{}".format(input_folder_path, MF_JE_template_name), "rb") as f:
in_mem_file = BytesIO(f.read())
MF_JE_template = oxl.load_workbook(in_mem_file)
dataframes.append(MF_JE_template)
continue
        else:
            # this produces two CSVs from the first two sheets of the master file
            cc_to_ba, accounts = load_csv("{}{}".format(input_folder_path, file_name), has_sheets=True)
            dataframes.extend([cc_to_ba, accounts])
            break  # the master file is the last one in file_names, so we are done
df = load_csv("{}{}".format(input_folder_path, file_name), skiprows=skiprows, usecols=usecols)
dataframes.append(df)
# resetting params
usecols = skiprows = None
return dataframes
In [7]:
def collect_garbage():
# remove no longer needed files
WD_report_byproduct = "{}{}".format(WD_report_name[:-5], ".csv")
WD2_report_byproduct = "{}{}".format(WD2_report_name[:-5], ".csv")
excel_to_csv_macro_byproducts = ["1.csv", "2.csv", WD_report_byproduct, WD2_report_byproduct]
for byproduct in excel_to_csv_macro_byproducts:
remove("{}{}".format(input_folder_path, byproduct))
remove("ExcelToCsv.vbs")
In [8]:
def initial_cleanup():
# TODO: deal with scientific-notation-like business areas converting to sci-notation
global WD_report, WD2_report
collect_garbage()
    # remove rows with a total amount of 0 or less. float() cannot parse amounts formatted like
    # 123,456.00, so the commas are stripped first; locale-based parsing would be fragile because
    # the process is worldwide (see the alternative sketch after this cell)
    WD_report["Net Amount LC"] = WD_report["Net Amount LC"].apply(lambda x: x.replace(",", "") if isinstance(x, str) else x)
WD_report["Net Amount LC"] = WD_report["Net Amount LC"].map(float)
WD_report = WD_report[WD_report["Net Amount LC"] > 0]
    WD2_report["Billing Amount"] = WD2_report["Billing Amount"].apply(lambda x: x.replace(",", "") if isinstance(x, str) else x)
    # for card expenses, negative amounts are wrapped in parentheses, e.g. (100.00); the line below drops such rows
    WD2_report = WD2_report[WD2_report["Billing Amount"].apply(lambda x: "(" not in str(x))]
WD2_report["Billing Amount"] = WD2_report["Billing Amount"].map(float)
    # filter out rows with a missing cost center/cost location, as this data is critical to generating an accrual
WD_report.dropna(subset=["Cost Center"], inplace=True)
WD2_report.dropna(subset=["Report Cost Location"], inplace=True)
    # keep only the code in the Cost Center/Cost Location columns (each cell holds the code followed by a description)
WD_report["Cost Center"] = WD_report["Cost Center"].astype("str").map(lambda x: x.split()[0])
WD2_report["Report Cost Location"] = WD2_report["Report Cost Location"].astype("str").map(lambda x: x.split()[0])
# add "Company code" column as it will be used by generate_output() to generate a separate folder for each company code
WD2_report["Company code"] = WD2_report["Report Cost Location"].apply(lambda x: x[:4])
WD_report = WD_report[WD_report["Expense Report Number"].apply(lambda x: "Cancelled" not in x)]
In [9]:
def vlookup(report, what, left_on, right_on):
merged = report.merge(what, left_on=left_on, right_on=right_on, how="left")
return merged
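In [ ]:
# A toy illustration of vlookup(): a left merge keeps every row of the report and
# attaches the matching columns from the lookup table, much like Excel's VLOOKUP.
# The data below is made up.
toy_report = pd.DataFrame({"Cost Center": ["CC01", "CC02", "CC03"]})
toy_lookup = pd.DataFrame({"Cost Center": ["CC01", "CC02"], "Business Area": ["1015", "1019"]})
vlookup(toy_report, toy_lookup, left_on="Cost Center", right_on="Cost Center")
# CC03 survives with NaN in "Business Area", mirroring VLOOKUP's #N/A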
In [10]:
def run_vlookups():
global WD_report, WD2_report
accounts = pd.DataFrame(accounts_file["Account"]).astype(int)
master_data_to_join = master_data_file[["Business Area", "Profit Center", "MRU", "Functional Area"]]
WD_report = vlookup(WD_report, accounts, left_on=[WD_report["Expense Item"], WD_report["Entity Code"]], right_on=[accounts_file["Expense Item name"], accounts_file["Subsidiary"]])
    # the account number is provided separately for each country, but every country currently uses the same
    # account for a given category; merging on both category and entity avoids duplicate rows and keeps the
    # lookup correct even if a country gets its own account in the future
WD_report = vlookup(WD_report, master_data_to_join, WD_report["Cost Center"], master_data_file["Cost Center"])
WD2_report = vlookup(WD2_report, master_data_to_join, WD2_report["Report Cost Location"], master_data_file["Cost Center"])
In [11]:
def final_cleanup():
global WD_report
global accounts_file
travel_journal_item_account = 46540000
company_celebration_account = 46900000
german_debit_account = 46920000
    # add vlookup exceptions; boolean masks are used instead of positional indices, because the
    # report's index is no longer contiguous after the earlier filtering
    category = WD_report["Expense Item"].astype(str)  # this column can load as float, hence astype(str)
    WD_report.loc[category.str.contains("Travel Journal Item", regex=False), "Account"] = travel_journal_item_account
    WD_report.loc[category.str.contains("Company Celebration", regex=False), "Account"] = company_celebration_account
# controllership requirement: change all 9999 BA's to 1019, 1059 to 1015
WD_report["Business Area"] = WD_report["Business Area"].apply(lambda x: "1019" if str(x) == "9999" else x)
WD_report["Business Area"] = WD_report["Business Area"].apply(lambda x: "1015" if str(x) == "1059" else x)
# this is to stop Excel from reading e.g. 2E00 as a number in scientific notation
WD_report["Business Area"] = WD_report["Business Area"].map(str)
    # note that for DESA this also overrides the two exceptions above, which get the German account instead
WD_report.loc[WD_report["Entity Code"] == "DESA", "Account"] = german_debit_account
# ensure that account number is provided, and that it is an integer
try:
WD_report["Account"] = WD_report["Account"].map(int)
    except (TypeError, ValueError):
        # some account numbers were not found for a company code-category combination -> fall back to the
        # account for the same category under another company code (all company codes should share the account)
        lines_with_missing_account = WD_report[WD_report["Account"].isnull()].copy()
        # remove those lines from WD_report
        WD_report = WD_report[~WD_report["Account"].isnull()]
        # drop duplicate categories, leaving the first account found for each category; that account is
        # what gets assigned to the missing values
        deduplicated_accounts_file = accounts_file.drop_duplicates(subset=["Expense Item name"])
        accounts = pd.DataFrame(deduplicated_accounts_file["Account"])
        # drop the empty Account column so that the merge does not produce duplicate columns
        lines_with_missing_account.drop("Account", axis=1, inplace=True)
        merged = lines_with_missing_account.merge(accounts, left_on=lines_with_missing_account["Expense Item"],
                                                  right_on=deduplicated_accounts_file["Expense Item name"], how="left")
        WD_report = pd.concat([WD_report, merged])  # df.append() was removed in pandas 2.0
        WD_report["Account"] = WD_report["Account"].map(int)
# add a checksum so we can group by BA + PC combinations
WD_report["Checksum"] = WD_report["Profit Center"].astype(str) + WD_report["Business Area"]
WD2_report["Checksum"] = WD2_report["Profit Center"].astype(str) + WD2_report["Business Area"]
# restore column order after df.append()
final_column_order = ["Entity Code", "Cost Center", "Expense Report Number", "Expense Item", "Net Amount LC", "Account", "Business Area", "Profit Center", "Functional Area", "MRU", "Checksum"]
WD_report = WD_report.reindex(columns=final_column_order)
# add the generic account number to all card expenses
WD2_report["Account"] = generic_GL_account
# drop currency column, rename WD2's columns to match WD's, and append it
WD1_to_WD2_columns_mapping = {"Transaction ID": "Expense Report Number", "Billing Amount": "Net Amount LC",
"Report Cost Location": "Cost Center", "Company code": "Entity Code"}
WD2_report.drop("Currency", axis=1, inplace=True)
WD2_report.rename(columns=WD1_to_WD2_columns_mapping, inplace=True)
    WD_report = pd.concat([WD_report, WD2_report])  # df.append() was removed in pandas 2.0
    # rename the column with card transaction numbers back to a sensible name
WD2_report.rename(columns={"Expense Report Number": "Transaction number"}, inplace=True)
In [12]:
# chdir(path.dirname(argv[0]))
# getcwd()
In [29]:
WD_report, WD2_report, MF_JE_template, master_data_file, accounts_file = load_all()
In [30]:
initial_cleanup()
In [31]:
run_vlookups()
In [32]:
final_cleanup()
In [33]:
wrangled_WD_report_save_path = "../Script/Output/wrangled_reports/" + wrangled_WD_report_name
wrangled_WD2_report_save_path = "../Script/Output/wrangled_reports/" + wrangled_WD2_report_name
In [34]:
WD_report.to_csv(wrangled_WD_report_save_path, index=False)
In [35]:
WD2_report.to_csv(wrangled_WD2_report_save_path, index=False)
In [36]:
from urllib.request import urlopen
from json import loads
In [37]:
wrangled_WD_report = pd.read_csv("../Script/Output/wrangled_reports/{}".format(wrangled_WD_report_name))
WD_report_groupby_input = wrangled_WD_report[["Entity Code", "Checksum", "Account", "Expense Report Number", "Net Amount LC", "MRU", "Functional Area"]]
grouped_by_cc = WD_report_groupby_input.groupby("Entity Code", as_index=False)
JE_csv_columns = ["ACCOUNT", "DEBIT", "CREDIT", "TAX CODE", "LINE MEMO", "MRU", "BUSINESS AREA", "PROFIT CENTER", "FUNCTIONAL AREA",
"DATE", "POSTING PERIOD", "ACCOUNTING BOOK", "SUBSIDIARY", "CURRENCY", "MEMO", "REVERSAL DATE", "TO SUBSIDIARY",
"TRADING PARTNER", "TRADING PARTNER CODE", "UNIQUE ID"]
last_day_of_previous_month = pd.to_datetime("today") - pd.tseries.offsets.MonthEnd(1)
date_cut = last_day_of_previous_month.strftime("%m.%y")
first_day_of_current_month = pd.to_datetime("today").replace(day=1).strftime("%m/%d/%Y")
AP_account = 25702400 # the account from which the money will flow
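In [ ]:
# A reproducible sanity check of the date arithmetic above, pinned to a fixed date:
# subtracting MonthEnd(1) rolls back to the last day of the previous month, and
# replace(day=1) yields the reversal date on the first of the current month.
example_today = pd.Timestamp("2018-07-15")
(example_today - pd.tseries.offsets.MonthEnd(1)).strftime("%m/%d/%Y")  # '06/30/2018'
example_today.replace(day=1).strftime("%m/%d/%Y")  # '07/01/2018'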
In [38]:
def generate_exchange_rates():
# https://openexchangerates.org
    # TODO: move the API key out of the source, e.g. into an environment variable
    exchange_rates_api_key = "11f20df062814531be891cc0173702a6"
api_call = f"https://openexchangerates.org/api/latest.json?app_id={exchange_rates_api_key}"
rates_api_response = urlopen(api_call)
rates_api_response_str = rates_api_response.read().decode("ascii")
rates_api_response_dict = loads(rates_api_response_str)
rates = rates_api_response_dict["rates"]
# feel free to update company codes/currencies
currencies_in_scope = {"AUSA": "AUD", "BESA": "EUR", "BGSA": "BGN", "BRSA": "BRL", "CASA": "CAD", "CHSD": "CHF", "CNSA": "CNY",
"CRSB": "CRC", "CZSA": "CZK", "DESA": "EUR", "DKSA": "DKK", "ESSA": "EUR", "FRSA": "EUR", "GBF0": "USD",
"GBSA": "GBP", "IESA": "EUR", "IESB": "EUR", "ILSA": "ILS", "ILSB": "ILS", "INSA": "INR", "INSB": "INR",
"INSD": "INR", "ITSA": "EUR", "JPSA": "JPY", "LUSB": "EUR", "MXSC": "MXN", "NLSC": "EUR", "PHSB": "PHP",
"PLSA": "PLN", "PRSA": "PYG", "ROSA": "RON", "RUSA": "RUB", "SESA": "SEK", "TRSA": "TRY", "USMS": "USD",
"USSM": "USD", "USSN": "USD"}
exchange_rates_to_usd = {}
for company_code in currencies_in_scope:
currency = currencies_in_scope[company_code]
        # the rates from the API are USD -> x; we need x -> USD, hence the inversion
        try:
            exchange_rates_to_usd[company_code] = 1 / rates[currency]
        except KeyError:
            # the currency is missing from the API response, so this company code gets no rate
            continue
return exchange_rates_to_usd
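In [ ]:
# The inversion in generate_exchange_rates(), illustrated with stubbed API data
# (no network call; the rates below are made up). The API returns USD -> currency
# rates, so dividing by the rate converts a local-currency amount back to USD.
stub_rates = {"PLN": 3.70, "EUR": 0.85}
local_amount_pln = 1000.0
local_amount_pln * (1 / stub_rates["PLN"])  # ~270.27 USD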
In [23]:
def generate_output(cc): # cc = Company Code
CSV_file_path = "../Script/Output/upload_to_Netsuite/{}_Accrual_WD_{}.csv".format(cc, date_cut)
MF_JE_template_save_path = "../Script/Output/upload_to_Sharepoint/{}_{}{}".format(cc, MF_JE_template_name[:-5], ".xlsx")
JE_csv = pd.DataFrame(columns=JE_csv_columns)
cur_cc_data = grouped_by_cc.get_group(cc)
grouped_by_checksum = cur_cc_data.groupby(["Checksum"])
posting_month = last_day_of_previous_month.strftime("%b")
posting_year = last_day_of_previous_month.strftime("%Y")
posting_period = "{} {}".format(posting_month, posting_year)
# this is a way to track row number, so that groups can be input to consecutive rows
cur_group_start_row = 0
for checksum, g in grouped_by_checksum:
business_area = checksum[-4:] # BA is the last 4 chars of checksum
profit_center = checksum[:5] # PC is the first 5 chars of checksum
general_description = "WD {} ACCRUALS {} FY{}".format(cc, posting_month, posting_year)
for i in range(cur_group_start_row, cur_group_start_row + len(g)):
            # for each line in a given checksum group (BA + PC combination), copy its Account column value
            # into the next free cell of the "ACCOUNT" column in the JE CSV form
JE_csv.loc[i, "ACCOUNT"] = g.iloc[i - cur_group_start_row]["Account"]
JE_csv.loc[i, "DEBIT"] = g.iloc[i - cur_group_start_row]["Net Amount LC"]
JE_csv.loc[i, "LINE MEMO"] = g.iloc[i - cur_group_start_row]["Expense Report Number"] + " Accrual"
# Note that even though the template has a TRANSACTION DATE - DAY field, it still passes the whole date in mm/dd/YYYY format
JE_csv.loc[i, "DATE"] = last_day_of_previous_month.strftime("%m/%d/%Y")
JE_csv.loc[i, "POSTING PERIOD"] = posting_period
JE_csv.loc[i, "SUBSIDIARY"] = cc
JE_csv.loc[i, "MEMO"] = general_description
JE_csv.loc[i, "REVERSAL DATE"] = first_day_of_current_month
JE_csv.loc[i, "MRU"] = g.iloc[i - cur_group_start_row]["MRU"]
JE_csv.loc[i, "FUNCTIONAL AREA"] = g.iloc[i - cur_group_start_row]["Functional Area"]
# here we're filling out the AP account row
last_group_start_row = cur_group_start_row
cur_group_start_row += len(g)
JE_csv.loc[cur_group_start_row, "ACCOUNT"] = AP_account
JE_csv.loc[cur_group_start_row, "CREDIT"] = JE_csv.loc[last_group_start_row:cur_group_start_row, "DEBIT"].sum()
JE_csv.loc[cur_group_start_row, "LINE MEMO"] = general_description
JE_csv.loc[cur_group_start_row, "BUSINESS AREA"] = business_area
JE_csv.loc[cur_group_start_row, "PROFIT CENTER"] = profit_center
JE_csv.loc[cur_group_start_row, "DATE"] = last_day_of_previous_month.strftime("%m/%d/%Y")
JE_csv.loc[cur_group_start_row, "POSTING PERIOD"] = posting_period
JE_csv.loc[cur_group_start_row, "SUBSIDIARY"] = cc
JE_csv.loc[cur_group_start_row, "MEMO"] = general_description
JE_csv.loc[cur_group_start_row, "REVERSAL DATE"] = first_day_of_current_month
cur_group_start_row += 1
JE_amount_local = JE_csv["CREDIT"].sum(skipna=True)
exchange_rates = generate_exchange_rates()
amount_in_usd = JE_amount_local * exchange_rates[cc]
    # company requirement: only generate files for accruals above 5,000 USD
    if amount_in_usd > 5000:
JE_csv.to_csv(CSV_file_path, index=False)
print("{} CSV file generated :)".format(cc))
        # TODO: since wb.save() closes the workbook, we would need to reopen it on each call... hence using
        # the lower-level openpyxl.writer.excel.ExcelWriter's write_data() method (see the sketch after this cell)
        archive = ZipFile(MF_JE_template_save_path, 'w', ZIP_DEFLATED, allowZip64=True)
        writer = oxl.writer.excel.ExcelWriter(MF_JE_template, archive)
        # writer._comments = []  TODO: do this for each sheet
        writer.write_data()
        archive.close()  # without closing, the zip's central directory is never written and the XLSX is corrupt
        print("{} template file generated :)".format(cc))
        MF_JE_template.close()
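In [ ]:
# A sketch of an alternative to the ZipFile/ExcelWriter internals above, addressing
# the TODO: reload the template from disk on every call so that the public
# Workbook.save() can be used instead of openpyxl's private writer. Slower, but
# save() finalises the archive correctly. The helper name is illustrative.
def save_template_copy(cc, save_path):
    template = oxl.load_workbook("{}{}".format(input_folder_path, MF_JE_template_name))
    # ... fill in the template for this company code here ...
    template.save(save_path)
    template.close()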
In [39]:
for key, group in grouped_by_cc:
company_code = key
generate_output(company_code)