notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt



In [5]:

    
full_data_path = '../../data/raw/UCB.csv'
data_path = '/Users/choldgraf/Google Drive/Projects/BIDS/Sourcing/data/UCB_cleaned.csv'
dept_path = '/Users/choldgraf/Google Drive/Projects/BIDS/Sourcing/data/BerkeleyPO_Department.csv'



In [6]:

    
output_path = '../data/cleaned/UCB_cleaned.csv'
merged_path = '../data/cleaned/UCB_dept_merge.csv'



In [7]:

    
full_data = pd.read_csv(full_data_path)









    



/Users/choldgraf/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (12,51,53,55,58,59,66,94,100,101,102,107,109,139) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)



In [8]:

    
# Subset of columns
keep_columns = ['PO ID',
                'PO #',
                'Creation Date',
                'Supplier Name',
                'Item Type',
                'Product Description',
                'Manufacturer',
                'Quantity',
                'Unit Price',
                'Department',
                'Buyer: First Name',
                'Buyer: Last Name',
                'PO Closed Date']
data = full_data[keep_columns]



In [9]:

    
# Replace problematic characters in column names
def convert_strings_to_specials(s):
    s = s.replace(' ', '_')
    s = s.replace(':', '_')
    s = s.replace('#', 'num')
    s = s.lower()
    return s
data.columns = [convert_strings_to_specials(col) for col in data.columns]



In [10]:

    
# Remove rows with some percentage null values
null_perc_cutoff = .5
ix_null = data.isnull().sum(1) < np.round(null_perc_cutoff * data.shape[1])
data = data.loc[ix_null]



In [13]:

    
# Turn column values into correct dtype
convert_int = ['po_id', 'quantity']
convert_float = ['unit_price']
convert_date = ['po_closed_date', 'creation_date']

data[convert_int] = data[convert_int].astype(np.int64)
data[convert_float] = data[convert_float].replace(',', '', regex=True).astype(float)



In [ ]:

    
# Convert dates (this may take some time)
data[convert_date] = data[convert_date].apply(pd.to_datetime)



In [ ]:

    
data.to_csv(output_path, index=False)

Merge with department



In [378]:

    
# Department ID
dept = pd.read_csv(dept_path)
dept['Spend'] = dept['Spend'].replace(',', '', regex=True).astype(float)
dept = dept.sort('Spend', ascending=False)



In [379]:

    
# Clean up the department data
dept['Spend'] = dept.Spend.replace(',', '', regex=True).astype(float)
rename_dict = {col: ''.join([c for c in col if c.isalnum()]) for col in dept.columns}
dept = dept.rename(columns=rename_dict)
dept = dept.dropna(axis=0, subset=['PONumber'])
dept = dept[dept.PONumber != 'Null']



In [380]:

    
merged = pd.merge(data, dept, left_on='po_num', right_on='PONumber', how='inner')



In [383]:

    
merged.to_csv(merged_path, index=False)



In [ ]: