First, we have to perform a lot of wrangling on the reports generated by Workday.


In [27]:
import pandas as pd
import openpyxl as oxl
from subprocess import call
from os import remove, chdir, path, getcwd
from io import BytesIO
from sys import argv
from zipfile import ZipFile, ZIP_DEFLATED
import lxml

In [3]:
# TODO: move this to .settings to add clarity for future editor

# file names of the two Workday report exports (EXP031 = with expense report, EXP032 = card/no expense report)
WD_report_name = "EXP031-RPT-Process-Accruals_with_Expense_Report.xlsx"
WD2_report_name = "EXP032-RPT-Process-Accruals-_No_Expense.xlsx"
# output names for the wrangled (cleaned + joined) reports written at the end of the wrangling section
wrangled_WD_report_name = "EXP031_plus_EXP032_wrangled.csv"
wrangled_WD2_report_name = "EXP032_wrangled.csv"
# Excel journal-entry template workbook, filled per company code for the Sharepoint upload
MF_JE_template_name = "MF_JE_template.xlsx"
# master workbook; its first two sheets are loaded as cc_to_ba and accounts by load_all()
master_file_name = "WD_Accruals_Master.xlsx"
input_folder_path = "../Script/Input/"
# GL account assigned to all card (WD2) expenses in final_cleanup()
generic_GL_account = 46540000

In [4]:
def generate_vbs_script():
    """Write ExcelToCsv.vbs into the current working directory.

    The VBS script drives Excel via COM to save a chosen worksheet of an
    XLSX workbook as CSV (Excel format code 6), which pandas can parse much
    faster than native XLSX.  Windows-only: requires cscript.exe and a local
    Excel installation.
    """
    vbscript = """if WScript.Arguments.Count < 3 Then
        WScript.Echo "Please specify the source and the destination files. Usage: ExcelToCsv <xls/xlsx source file> <csv destination file> <worksheet number (starts at 1)>"
        Wscript.Quit
    End If

    csv_format = 6

    Set objFSO = CreateObject("Scripting.FileSystemObject")

    src_file = objFSO.GetAbsolutePathName(Wscript.Arguments.Item(0))
    dest_file = objFSO.GetAbsolutePathName(WScript.Arguments.Item(1))
    worksheet_number = CInt(WScript.Arguments.Item(2))

    Dim oExcel
    Set oExcel = CreateObject("Excel.Application")

    Dim oBook
    Set oBook = oExcel.Workbooks.Open(src_file)
    oBook.Worksheets(worksheet_number).Activate

    oBook.SaveAs dest_file, csv_format

    oBook.Close False
    oExcel.Quit
    """
    try:
        with open("ExcelToCsv.vbs", "wb") as f:
            f.write(vbscript.encode("utf-8"))
    except Exception as e:
        # bug fix: Exception objects have no .msg attribute -- the old
        # print(e.msg) raised AttributeError inside the handler itself
        print(e)
        print("VBS script for converting xlsx files to csv could not be generated.")

In [5]:
def load_csv(xlsx_file_path, has_sheets=False, skiprows=None, usecols=None):
    """Convert an XLSX file to CSV via the ExcelToCsv.vbs helper, then load it.

    Parameters
    ----------
    xlsx_file_path : str
        Path to the source workbook.
    has_sheets : bool
        When True, convert worksheets 1 and 2 separately and return a tuple of
        DataFrames (used for the master workbook); otherwise convert only the
        first worksheet and return a single DataFrame.
    skiprows, usecols :
        Passed straight through to pandas.read_csv.
    """
    if has_sheets:
        # worksheet numbers to convert; fixed to sheets 1 and 2 because the
        # worksheet count cannot easily be retrieved from here
        sheet_dataframes = []
        for sheet in map(str, range(1, 3)):
            # note: input_folder_path already starts with "../Script/", so the
            # extra prefix is redundant but resolves to the same location
            csv_file_path = "../Script/{}{}{}".format(input_folder_path, sheet, ".csv")
            # //B is for batch mode; this is to avoid spam on the console :)
            call(["cscript.exe", "../Script/ExcelToCsv.vbs", xlsx_file_path, csv_file_path, sheet, r"//B"])
            try:
                sheet_dataframes.append(pd.read_csv(csv_file_path, encoding="latin-1", engine="c", usecols=usecols))
            except Exception as e:
                # bug fix: Exception has no .msg attribute (the old handler
                # raised AttributeError itself) and the old code went on to
                # append an undefined/stale frame; report and re-raise instead
                print(e)
                print("Sheets could not be converted to CSV format.")
                raise
        return tuple(sheet_dataframes)
    else:
        # replace the trailing "xlsx" with "csv" (the dot is kept by [:-4])
        csv_file_path = "{}{}".format(xlsx_file_path[:-4], "csv")
        # //B is for batch mode; this is to avoid spam on the console :)
        call(["cscript.exe", "../Script/ExcelToCsv.vbs", xlsx_file_path, csv_file_path, str(1), r"//B"])
        try:
            # read_csv treats skiprows=None as "skip nothing", so the two
            # previously duplicated branches collapse into one call
            data = pd.read_csv(csv_file_path, skiprows=skiprows, encoding="latin-1", engine="c", usecols=usecols)
        except Exception as e:
            print(e)
            print("Something went wrong... make sure report names weren't changed or debug the load_csv function")
            raise
        return data

In [28]:
def load_all():
    """Load every input file for the accrual run.

    Returns a list in this order: WD_report (EXP031), WD2_report (EXP032),
    MF_JE_template (openpyxl workbook), cc_to_ba and accounts (the first two
    sheets of the master workbook) -- matching the unpacking at the call site.
    """
    file_names = [WD_report_name, WD2_report_name, MF_JE_template_name, master_file_name]
    dataframes = []
    WD1_required_cols = ["Entity Code", "Cost Center", "Expense Report Number", "Expense Item", "Net Amount LC"]
    WD2_required_cols = ["Transaction ID", "Billing Amount", "Currency", "Report Cost Location"]

    # the script will be used by load_csv() to convert XLSX to CSV for faster processing
    generate_vbs_script()

    for file_name in file_names:
        # bug fix: initialize the per-file read parameters at the top of every
        # iteration instead of resetting them at the bottom -- the old code
        # left skiprows unbound if the file order ever changed
        usecols = None
        skiprows = None
        if file_name == WD_report_name:
            usecols = WD1_required_cols
            skiprows = [0]  # the first row of EXP031 is a banner, not column headers
        elif file_name == WD2_report_name:
            usecols = WD2_required_cols
        elif file_name == MF_JE_template_name:
            # the template is kept as an openpyxl workbook because it is
            # written back out later by generate_output()
            with open("{}{}".format(input_folder_path, MF_JE_template_name), "rb") as f:
                in_mem_file = BytesIO(f.read())
            MF_JE_template = oxl.load_workbook(in_mem_file)
            dataframes.append(MF_JE_template)
            continue
        else:
            # this will produce two CSVs from the first two sheets of the master workbook
            cc_to_ba, accounts = load_csv("{}{}".format(input_folder_path, file_name), has_sheets=True)
            dataframes.extend([cc_to_ba, accounts])
            break
        df = load_csv("{}{}".format(input_folder_path, file_name), skiprows=skiprows, usecols=usecols)
        dataframes.append(df)
    return dataframes

In [7]:
def collect_garbage():
    """Delete the intermediate CSV files and the VBS helper script.

    Robustness fix: a byproduct missing on disk (e.g. after a partial or
    repeated run) is not an error, so FileNotFoundError is tolerated instead
    of aborting the cleanup halfway through.
    """
    # the XLSX-to-CSV conversion leaves a same-named .csv next to each report
    WD_report_byproduct = "{}{}".format(WD_report_name[:-5], ".csv")
    WD2_report_byproduct = "{}{}".format(WD2_report_name[:-5], ".csv")
    # 1.csv / 2.csv come from the two master-workbook sheets
    excel_to_csv_macro_byproducts = ["1.csv", "2.csv", WD_report_byproduct, WD2_report_byproduct]
    for byproduct in excel_to_csv_macro_byproducts:
        try:
            remove("{}{}".format(input_folder_path, byproduct))
        except FileNotFoundError:
            pass
    try:
        remove("ExcelToCsv.vbs")
    except FileNotFoundError:
        pass

In [8]:
def initial_cleanup():
    """First cleaning pass over both Workday reports (rebinds the module-level
    WD_report / WD2_report frames, hence the `global` declaration).

    Normalizes amounts to float, drops non-positive / cancelled / parenthesized
    (negative) lines, removes rows without a cost center, and derives the
    company code for the card report.
    """
    # TODO: deal with scientific-notation-like business areas converting to sci-notation
    global WD_report, WD2_report

    collect_garbage()

    # Workday exports amounts as strings like "123,456.00"; float() cannot parse
    # the thousands separator and locale-based parsing is a bad idea for a
    # worldwide process, so strip the commas manually. Non-string values pass
    # through untouched (bug fix: the previous bare `except: pass` silently
    # skipped the whole column whenever a single value lacked .replace()).
    strip_commas = lambda x: x.replace(",", "") if isinstance(x, str) else x
    WD_report["Net Amount LC"] = WD_report["Net Amount LC"].apply(strip_commas)
    WD_report["Net Amount LC"] = WD_report["Net Amount LC"].map(float)
    # keep only lines with a positive total amount
    WD_report = WD_report[WD_report["Net Amount LC"] > 0]

    WD2_report["Billing Amount"] = WD2_report["Billing Amount"].apply(strip_commas)
    # for card expenses, negative amounts are put in parentheses, e.g. (100.00);
    # below line removes lines with such amounts (str() guard: a numeric value
    # would otherwise make the `in` test raise TypeError)
    WD2_report = WD2_report[WD2_report["Billing Amount"].apply(lambda x: "(" not in str(x))]
    WD2_report["Billing Amount"] = WD2_report["Billing Amount"].map(float)

    # filter out lines with missing cost center/cost location, as this data is critical to generating an accrual
    WD_report.dropna(subset=["Cost Center"], inplace=True)
    WD2_report.dropna(subset=["Report Cost Location"], inplace=True)

    # the cells contain "<code> <description>"; keep only the leading code
    WD_report["Cost Center"] = WD_report["Cost Center"].astype("str").map(lambda x: x.split()[0])
    WD2_report["Report Cost Location"] = WD2_report["Report Cost Location"].astype("str").map(lambda x: x.split()[0])

    # add "Company code" column as it will be used by generate_output() to generate a separate folder for each company code
    WD2_report["Company code"] = WD2_report["Report Cost Location"].apply(lambda x: x[:4])

    # drop cancelled expense reports
    WD_report = WD_report[WD_report["Expense Report Number"].apply(lambda x: "Cancelled" not in x)]

In [9]:
def vlookup(report, what, left_on, right_on):
    """Emulate Excel's VLOOKUP: left-join `what` onto `report`.

    Rows of `report` with no match in `what` keep NaN in the joined columns;
    the original row count of `report` is preserved (one-to-one matches).
    """
    return report.merge(what, left_on=left_on, right_on=right_on, how="left")

In [10]:
def run_vlookups():
    """Join GL account numbers and master data onto both Workday reports.

    Rebinds the module-level WD_report / WD2_report (hence `global`);
    accounts_file and master_data_file are only read, so they need no
    declaration.
    """
    global WD_report, WD2_report
    # account numbers as integers, one column joined in by the first vlookup
    accounts = pd.DataFrame(accounts_file["Account"]).astype(int)
    master_data_to_join = master_data_file[["Business Area", "Profit Center", "MRU", "Functional Area"]]

    # match on (expense category, subsidiary) to pull in the GL account number
    WD_report = vlookup(WD_report, accounts, left_on=[WD_report["Expense Item"], WD_report["Entity Code"]], right_on=[accounts_file["Expense Item name"], accounts_file["Subsidiary"]])
    # the account number is provided separately for each country. However, all countries have the same account for a given category, so the join key includes the subsidiary.
    # in case any country has a separate account for a given category in the future, the script will still work
    # then pull BA / PC / MRU / FA from master data, keyed on the cost center
    WD_report = vlookup(WD_report, master_data_to_join, WD_report["Cost Center"], master_data_file["Cost Center"])
    WD2_report = vlookup(WD2_report, master_data_to_join, WD2_report["Report Cost Location"], master_data_file["Cost Center"])

In [11]:
def final_cleanup():
    """Final wrangling pass: apply category-specific account exceptions,
    normalize business areas, backfill missing accounts, and append the card
    report (WD2) to the expense report (WD1) under one shared column layout.

    Rebinds the module-level WD_report (hence `global`); WD2_report is only
    modified via item assignment and inplace calls, which rebind nothing and
    therefore need no `global` statement.
    """
    global WD_report
    global accounts_file
    # account-number overrides for specific expense categories / entities
    travel_journal_item_account = 46540000
    company_celebration_account = 46900000
    german_debit_account = 46920000

    # add vlookup exceptions
    # NOTE(review): .loc[row_index, ...] pairs a positional counter with
    # label-based indexing -- safe only while WD_report keeps the default
    # RangeIndex produced by the merges; confirm if the pipeline changes
    no_of_items = WD_report.shape[0]
    for row_index in range(no_of_items):
        category = str(WD_report["Expense Item"].iloc[row_index])  # for some reason this column is loaded as float, hence the str()
        if "Travel Journal Item" in category:
            WD_report.loc[row_index, "Account"] = travel_journal_item_account
            # WD_report.set_value(index, "Acc#", travel_journal_item_account)
        if "Company Celebration" in category:
            # WD_report.set_value(index, "Acc#", company_celebration_account)
            WD_report.loc[row_index, "Account"] = company_celebration_account

    # controllership requirement: change all 9999 BA's to 1019, 1059 to 1015
    WD_report["Business Area"] = WD_report["Business Area"].apply(lambda x: "1019" if str(x) == "9999" else x)
    WD_report["Business Area"] = WD_report["Business Area"].apply(lambda x: "1015" if str(x) == "1059" else x)

    # this is to stop Excel from reading e.g. 2E00 as a number in scientific notation
    WD_report["Business Area"] = WD_report["Business Area"].map(str)

    # note that this also overrides the above two exceptions, which are changed to the german account
    WD_report.loc[WD_report["Entity Code"] == "DESA", "Account"] = german_debit_account

    # ensure that account number is provided, and that it is an integer
    try:
        WD_report["Account"] = WD_report["Account"].map(int)
    except:
        # this means that some account numbers were not found for a company code-category combination -> use an account
        # for the same category, but another company code (all CCs should use the same account)
        # NOTE(review): the bare except assumes the only failure mode is NaN
        # accounts; any other error raised by map(int) would be misdiagnosed here
        lines_with_missing_account = WD_report[WD_report["Account"].isnull()]
        # remove above lines from WD_report
        WD_report = WD_report[~WD_report["Account"].isnull()]
        # remove duplicate categories, effectively leaving the first found acc # for a given category, which is what is going to be assigned for missing values
        deduplicated_accounts_file = accounts_file.drop_duplicates(subset=["Expense Item name"])
        accounts = pd.DataFrame(deduplicated_accounts_file["Account"])
        # dropping Account column so that merge does not produce useless new columns
        #deduplicated_accounts_file.drop("Account", axis=1, inplace=True)
        #accounts.rename(columns={"Account": "Acc#"}, inplace=True)
        # NOTE(review): lines_with_missing_account is a slice of WD_report, so
        # this inplace drop may emit SettingWithCopyWarning
        lines_with_missing_account.drop("Account", axis=1, inplace=True)
        merged = lines_with_missing_account.merge(accounts, left_on=lines_with_missing_account["Expense Item"],
                                                  right_on=deduplicated_accounts_file["Expense Item name"], how="left")
        # NOTE(review): DataFrame.append is deprecated (removed in pandas 2.x);
        # pd.concat is the forward-compatible equivalent if the environment is upgraded
        WD_report = WD_report.append(merged)
        WD_report["Account"] = WD_report["Account"].map(int)

    # add a checksum so we can group by BA + PC combinations
    WD_report["Checksum"] = WD_report["Profit Center"].astype(str) + WD_report["Business Area"]
    WD2_report["Checksum"] = WD2_report["Profit Center"].astype(str) + WD2_report["Business Area"]

    # restore column order after df.append()
    final_column_order = ["Entity Code", "Cost Center", "Expense Report Number", "Expense Item", "Net Amount LC", "Account", "Business Area", "Profit Center", "Functional Area", "MRU", "Checksum"]
    WD_report = WD_report.reindex(columns=final_column_order)

    # add the generic account number to all card expenses
    WD2_report["Account"] = generic_GL_account

    # drop currency column, rename WD2's columns to match WD's, and append it
    WD1_to_WD2_columns_mapping = {"Transaction ID": "Expense Report Number", "Billing Amount": "Net Amount LC", 
                                  "Report Cost Location": "Cost Center", "Company code": "Entity Code"}
    WD2_report.drop("Currency", axis=1, inplace=True)
    WD2_report.rename(columns=WD1_to_WD2_columns_mapping, inplace=True)
    WD_report = WD_report.append(WD2_report)

    # rename the column with card transaction numbers back to a sensible name
    WD2_report.rename(columns={"Expense Report Number": "Transaction number"}, inplace=True)

Add the cell below to the final script; ensure that getcwd() returns the correct path before running.


In [12]:
# chdir(path.dirname(argv[0]))
# getcwd()

In [29]:
# load all inputs: the two Workday reports, the JE template workbook, and the two master-data sheets
WD_report, WD2_report, MF_JE_template, master_data_file, accounts_file = load_all()


C:\syf\Anaconda\lib\site-packages\openpyxl\reader\worksheet.py:310: UserWarning: Conditional Formatting extension is not supported and will be removed
  warn(msg)
C:\syf\Anaconda\lib\site-packages\openpyxl\reader\worksheet.py:310: UserWarning: Data Validation extension is not supported and will be removed
  warn(msg)

In [30]:
# normalize amounts / cost centers and drop unusable rows in both reports
initial_cleanup()

In [31]:
# join account numbers and master data onto both reports
run_vlookups()

In [32]:
# apply account exceptions and merge WD2 into WD_report
final_cleanup()

In [33]:
# output paths for the wrangled reports, consumed by the Netsuite-generation section below
wrangled_WD_report_save_path = "../Script/Output/wrangled_reports/" + wrangled_WD_report_name
wrangled_WD2_report_save_path = "../Script/Output/wrangled_reports/" + wrangled_WD2_report_name

In [34]:
# persist the combined (EXP031 + appended EXP032) wrangled report
WD_report.to_csv(wrangled_WD_report_save_path, index=False)

In [35]:
# persist the standalone card-expense report as well
WD2_report.to_csv(wrangled_WD2_report_save_path, index=False)

Now let's use these wrangled files to generate CSVs in the format accepted by Netsuite


In [36]:
from urllib.request import urlopen
from json import loads

In [37]:
# reload the wrangled combined report and group it by company code; one JE CSV is emitted per group
wrangled_WD_report = pd.read_csv("../Script/Output/wrangled_reports/{}".format(wrangled_WD_report_name))
WD_report_groupby_input = wrangled_WD_report[["Entity Code", "Checksum", "Account", "Expense Report Number", "Net Amount LC", "MRU", "Functional Area"]]
grouped_by_cc = WD_report_groupby_input.groupby("Entity Code", as_index=False)
# column layout of the Netsuite journal-entry upload CSV
JE_csv_columns = ["ACCOUNT", "DEBIT", "CREDIT", "TAX CODE", "LINE MEMO", "MRU", "BUSINESS AREA", "PROFIT CENTER", "FUNCTIONAL AREA",
                  "DATE", "POSTING PERIOD", "ACCOUNTING BOOK", "SUBSIDIARY", "CURRENCY", "MEMO", "REVERSAL DATE", "TO SUBSIDIARY",
                  "TRADING PARTNER", "TRADING PARTNER CODE", "UNIQUE ID"]
# accruals are posted on the last day of the previous month and reversed on the 1st of the current month
last_day_of_previous_month = pd.to_datetime("today") - pd.tseries.offsets.MonthEnd(1)
date_cut = last_day_of_previous_month.strftime("%m.%y")  # e.g. "05.24"; used in output file names
first_day_of_current_month = pd.to_datetime("today").replace(day=1).strftime("%m/%d/%Y")
AP_account = 25702400  # the account from which the money will flow

In [38]:
def generate_exchange_rates():
    """Fetch current exchange rates and map each company code to its
    local-currency -> USD rate.

    Returns a dict {company_code: rate_to_usd}; company codes whose currency
    is missing from the API response are silently omitted (callers must
    handle absent keys).
    """
    # https://openexchangerates.org
    # SECURITY fix: read the API key from the environment; the hard-coded
    # fallback is kept only for backward compatibility and should be removed
    # (a key committed to source control must be considered leaked).
    from os import environ
    exchange_rates_api_key = environ.get("OPENEXCHANGERATES_APP_ID", "11f20df062814531be891cc0173702a6")
    api_call = f"https://openexchangerates.org/api/latest.json?app_id={exchange_rates_api_key}"
    with urlopen(api_call) as rates_api_response:
        rates = loads(rates_api_response.read().decode("ascii"))["rates"]

    # feel free to update company codes/currencies
    currencies_in_scope = {"AUSA": "AUD", "BESA": "EUR", "BGSA": "BGN", "BRSA": "BRL", "CASA": "CAD", "CHSD": "CHF", "CNSA": "CNY",
                           "CRSB": "CRC", "CZSA": "CZK", "DESA": "EUR", "DKSA": "DKK", "ESSA": "EUR", "FRSA": "EUR", "GBF0": "USD",
                           "GBSA": "GBP", "IESA": "EUR", "IESB": "EUR", "ILSA": "ILS", "ILSB": "ILS", "INSA": "INR", "INSB": "INR",
                           "INSD": "INR", "ITSA": "EUR", "JPSA": "JPY", "LUSB": "EUR", "MXSC": "MXN", "NLSC": "EUR", "PHSB": "PHP",
                           "PLSA": "PLN", "PRSA": "PYG", "ROSA": "RON", "RUSA": "RUB", "SESA": "SEK", "TRSA": "TRY", "USMS": "USD",
                           "USSM": "USD", "USSN": "USD"}

    exchange_rates_to_usd = {}
    for company_code, currency in currencies_in_scope.items():
        # the rates from the API are USD -> x; we need x -> USD, hence the
        # reciprocal. Bug fix: catch only the expected failures instead of a
        # bare except, and drop the dead duplicate-key check (dict keys are
        # unique, so the old `if company_code in ...` branch never fired).
        try:
            exchange_rates_to_usd[company_code] = 1 / rates[currency]
        except (KeyError, ZeroDivisionError):
            continue
    return exchange_rates_to_usd

In [23]:
def generate_output(cc):  # cc = Company Code
    """Build the Netsuite journal-entry CSV for one company code and, when the
    accrual exceeds the USD threshold, write the CSV plus the Sharepoint
    template workbook.

    Reads the module-level grouped_by_cc, JE_csv_columns, date constants,
    AP_account and MF_JE_template.
    """
    CSV_file_path = "../Script/Output/upload_to_Netsuite/{}_Accrual_WD_{}.csv".format(cc, date_cut)
    MF_JE_template_save_path = "../Script/Output/upload_to_Sharepoint/{}_{}{}".format(cc, MF_JE_template_name[:-5], ".xlsx")
    JE_csv = pd.DataFrame(columns=JE_csv_columns)
    cur_cc_data = grouped_by_cc.get_group(cc)
    # one credit (AP) line is emitted per checksum (BA + PC combination) group
    grouped_by_checksum = cur_cc_data.groupby(["Checksum"])
    posting_month = last_day_of_previous_month.strftime("%b")
    posting_year = last_day_of_previous_month.strftime("%Y")
    posting_period = "{} {}".format(posting_month, posting_year)
    # this is a way to track row number, so that groups can be input to consecutive rows
    cur_group_start_row = 0

    for checksum, g in grouped_by_checksum:
        business_area = checksum[-4:]  # BA is the last 4 chars of checksum
        profit_center = checksum[:5]  # PC is the first 5 chars of checksum
        general_description = "WD {} ACCRUALS {} FY{}".format(cc, posting_month, posting_year)
        # one debit line per expense line in the group
        for i in range(cur_group_start_row, cur_group_start_row + len(g)):
            # for each line for a given checksum (BA and PC combination), retrieve its Acc# column value and input it
            # into the next free cell in the "ACCOUNT" column in the JE csv form
            JE_csv.loc[i, "ACCOUNT"] = g.iloc[i - cur_group_start_row]["Account"]
            JE_csv.loc[i, "DEBIT"] = g.iloc[i - cur_group_start_row]["Net Amount LC"]
            JE_csv.loc[i, "LINE MEMO"] = g.iloc[i - cur_group_start_row]["Expense Report Number"] + " Accrual"
            # Note that even though the template has a TRANSACTION DATE - DAY field, it still passes the whole date in mm/dd/YYYY format
            JE_csv.loc[i, "DATE"] = last_day_of_previous_month.strftime("%m/%d/%Y")
            JE_csv.loc[i, "POSTING PERIOD"] = posting_period
            JE_csv.loc[i, "SUBSIDIARY"] = cc
            JE_csv.loc[i, "MEMO"] = general_description
            JE_csv.loc[i, "REVERSAL DATE"] = first_day_of_current_month
            JE_csv.loc[i, "MRU"] = g.iloc[i - cur_group_start_row]["MRU"]
            JE_csv.loc[i, "FUNCTIONAL AREA"] = g.iloc[i - cur_group_start_row]["Functional Area"]

        # here we're filling out the AP account row (credit = sum of the group's debits)
        last_group_start_row = cur_group_start_row
        cur_group_start_row += len(g)
        JE_csv.loc[cur_group_start_row, "ACCOUNT"] = AP_account
        JE_csv.loc[cur_group_start_row, "CREDIT"] = JE_csv.loc[last_group_start_row:cur_group_start_row, "DEBIT"].sum()
        JE_csv.loc[cur_group_start_row, "LINE MEMO"] = general_description
        JE_csv.loc[cur_group_start_row, "BUSINESS AREA"] = business_area
        JE_csv.loc[cur_group_start_row, "PROFIT CENTER"] = profit_center
        JE_csv.loc[cur_group_start_row, "DATE"] = last_day_of_previous_month.strftime("%m/%d/%Y")
        JE_csv.loc[cur_group_start_row, "POSTING PERIOD"] = posting_period
        JE_csv.loc[cur_group_start_row, "SUBSIDIARY"] = cc
        JE_csv.loc[cur_group_start_row, "MEMO"] = general_description
        JE_csv.loc[cur_group_start_row, "REVERSAL DATE"] = first_day_of_current_month
        cur_group_start_row += 1

    JE_amount_local = JE_csv["CREDIT"].sum(skipna=True)
    # NOTE(review): this hits the exchange-rate API once per company code --
    # consider fetching the rates once before the caller's loop. Also,
    # exchange_rates[cc] raises KeyError if cc's currency was absent from the
    # API response (generate_exchange_rates silently omits such codes).
    exchange_rates = generate_exchange_rates()
    amount_in_usd =  JE_amount_local * exchange_rates[cc]
    to_generate = []

    # company requirement
    # NOTE(review): to_generate only ever holds cc, so the membership test
    # below is equivalent to checking amount_in_usd > 5000 directly
    if amount_in_usd > 5000:
        to_generate.append(cc)

    if cc in to_generate:
        JE_csv.to_csv(CSV_file_path, index=False)
        print("{} CSV file generated :)".format(cc))
        # TODO: since wb.save() closes the workbook, we would need to reopen it on each loop... hence doing it 
        # with the deeper openpyxl.writer.excel.ExcelWriter's write_data() method
        archive = ZipFile(MF_JE_template_save_path,'w', ZIP_DEFLATED, allowZip64=True)
        writer = oxl.writer.excel.ExcelWriter(MF_JE_template, archive)
        #writer._comments = [] TODO: do this for each sheet
        writer.write_data()
        #MF_JE_template.save(MF_JE_template_save_path)
        print("{} template file generated :)".format(cc))
    # NOTE(review): this closes the shared template workbook on EVERY call,
    # so later company codes reuse a closed workbook -- presumably related to
    # the AttributeError (vba_archive is None) seen in the captured traceback
    # below; confirm and move workbook lifecycle management to the caller
    MF_JE_template.close()

In [39]:
# emit one Netsuite CSV (and, above threshold, one template workbook) per company code
for key, group in grouped_by_cc:
    company_code = key
    generate_output(company_code)


AUSA CSV file generated :)
AUSA template file generated :)
DESA CSV file generated :)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-39-4f4d12b65ffe> in <module>()
      1 for key, group in grouped_by_cc:
      2     company_code = key
----> 3     generate_output(company_code)

<ipython-input-23-e123e78b7840> in generate_output(cc)
     63         writer = oxl.writer.excel.ExcelWriter(MF_JE_template, archive)
     64         #writer._comments = [] TODO: do this for each sheet
---> 65         writer.write_data()
     66         #MF_JE_template.save(MF_JE_template_save_path)
     67         print("{} template file generated :)".format(cc))

C:\syf\Anaconda\lib\site-packages\openpyxl\writer\excel.py in write_data(self)
     79             archive.writestr(ARC_THEME, write_theme())
     80 
---> 81         self._write_worksheets()
     82         self._write_chartsheets()
     83         self._write_images()

C:\syf\Anaconda\lib\site-packages\openpyxl\writer\excel.py in _write_worksheets(self)
    209 
    210             if ws._comments:
--> 211                 self._write_comment(ws)
    212 
    213             if ws.legacy_drawing is not None:

C:\syf\Anaconda\lib\site-packages\openpyxl\writer\excel.py in _write_comment(self, ws)
    179             vml = None
    180         else:
--> 181             vml = fromstring(self.workbook.vba_archive.read(ws.legacy_drawing))
    182 
    183         vml = cs.write_shapes(vml)

AttributeError: 'NoneType' object has no attribute 'read'

TODO: generate backup template files using openpyxl:

load_MF_JE_template()

if cc in to_generate:

    ### filled = fill_template(cc, template)
    ### filled.save(path)
    ### 

In [48]:



2 1

In [ ]: