USF Clustering Example


Modules For the Analysis and Create Initial Data


In [1]:
import pandas as pd
from sklearn.cluster import Ward
import matplotlib.pyplot as plt
import numpy as np
from pandas import merge
from sklearn.decomposition import PCA
from pandas import merge
from __future__ import division
from pandas import DataFrame
from pandas.tools.plotting import scatter_matrix


#Note, Spendradar creates the csv and adds something called a "BOM" to the front, impacting the first column header.
#Remove it by opening the csv in Notepad++ and changing the encoding to utf-8 without BOM.  Or just use the following:
# awk '{ if (NR==1) sub(/^\xef\xbb\xbf/,""); print }' Data_2.csv > Data2.csv

#Data is from SpendRadar and has the following format:
#Campus|UNSPSC Level 2|Supplier Normalized Name|Department Name|Org Level 4|Org Level 5|Spend Date|# of Lines|# of POs|# of Invoices

#Set the level of the organization (I reccomend 'Org Level 3')
Org_Level = 'Department Name'

#read in data, strip comma's out of the values and convert them to floats, index by "campus,Org"
#The parse dates option increases the loading time, be patient.
df = pd.read_csv('Data2.csv', 
                 parse_dates=[4],
                 converters={'Spend': lambda x: float(x.replace(',','')),
                              '# of Lines': lambda x: float(x.replace(',','')),
                              '# of POs': lambda x: float(x.replace(',','')),
                              '# of Invoices': lambda x: float(x.replace(',',''))
                              })

supplier_filter = '|'.join(['FISHER SCIENTIFIC',
                            'VWR',
                            'SIGMA ALDRICH',
                            'BIOEXPRESS',
                            'LIFE TECHNOLOGIES CORP',
                            'USA SCIENTIFIC',
                            'DENVILLE',
                            'CLONTECH',
                            'GE HEALTHCARE BIO',
                            'NEW ENGLAND BIO',
                            'SPECTRUM CHEMICAL', 
                            'SPECTRUM LABORATORY',
                            'PROMEGA'
                            ])


#Filter down to the data of interest, in this case by supplier. 
df_filtered = df[df['Supplier Normalized Name'].str.contains(supplier_filter) & (df['Spend'] > 0)]

#find the number of Suppliers in the dataset
N_Suppliers = df_filtered['Supplier Normalized Name'].nunique()

Analyze and vizualize the features using Glue


In [2]:
from glue import qglue

#Create a summary pivot for departments
Dept_Summary = df_filtered.pivot_table(rows=['Campus',Org_Level], values=['Spend','# of POs', '# of Invoices'], aggfunc=np.sum, fill_value=0).reset_index()

#pivot out by campus, department, and date then find the summary stats of the time based dimensions
#Create a frame that is grouped by month and sum of all values
dept_time = df_filtered.pivot_table(rows=['Campus',Org_Level,'Spend Date' ], values=['Spend','# of POs', '# of Invoices'], aggfunc=np.sum, fill_value=0).reset_index()

dept_time = dept_time.groupby(['Campus',Org_Level,dept_time['Spend Date'].map(lambda x: (x.month,x.year))]).sum().reset_index()

avg_monthly = dept_time.groupby(['Campus',Org_Level]).mean()
Count_Months = pd.DataFrame(dept_time.groupby(['Campus',Org_Level]).count()['Spend Date'])

qglue(DepartmentSummary=dept_time.reset_index())


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Anaconda\lib\site-packages\glue\qt\widgets\mpl_widget.pyc in _on_timeout(self)
    113             self._resize_timer.start()
    114         else:
--> 115             self.resize_end.emit()
    116 
    117     def paintEvent(self, event):

TypeError: pyqtSignal must be bound to a QObject, not 'MplCanvas'
C:\Anaconda\lib\site-packages\pandas\util\decorators.py:53: FutureWarning: rows is deprecated, use index instead
  warnings.warn(msg, FutureWarning)
Out[2]:
DataCollection (1 data set)
	  0: DepartmentSummary

In [3]:
from numpy import inf

Spend_per_Supplier = df_filtered.pivot_table(rows=['Campus',Org_Level], cols='Supplier Normalized Name', values='Spend', aggfunc=np.sum, fill_value=0) 
test = Spend_per_Supplier
test = pd.DataFrame(np.log(Spend_per_Supplier))
test.fillna(value=0)
test[test == -inf] = 0
test

test.plot(kind='scatter', x='SPECTRUM CHEMICALS & LABORATORY PRODUCTS', y='SIGMA ALDRICH INC')


Out[3]:
<matplotlib.axes.AxesSubplot at 0x1f98b518>

Create the Feature Space


In [2]:
#Create a set of features for the total spend per supplier per department
Spend_per_Supplier = df_filtered.pivot_table(rows=['Campus',Org_Level], cols='Supplier Normalized Name', values='Spend', aggfunc=np.sum, fill_value=0) 



#Create the Invoice Per PO feature
PO_by_Inv = df_filtered.pivot_table(rows=['Campus',Org_Level], values=['# of POs', '# of Invoices'], aggfunc=np.sum, fill_value=0)
cutoff = 20
#Replace # of PO = 0 with a .01 so that we can avoid INF errors
PO_by_Inv.replace(0,1,True)
#Divide
PO_by_Inv['Inv/PO'] = (PO_by_Inv['# of Invoices']/PO_by_Inv['# of POs'])
#Take care of inf
PO_by_Inv['Inv/PO'].replace('inf',cutoff,True)
#Drop unnecessary columns
PO_by_Inv = PO_by_Inv.drop(['# of Invoices','# of POs'],1)
#Filter out the outliers and change them to a "cutoff"
PO_by_Inv['Inv/PO'][PO_by_Inv['Inv/PO'] > cutoff] = cutoff
Log_PO_by_Inv = pd.DataFrame(np.log(PO_by_Inv))



#IS IT POSSIBLE (OR WORTH IT) TO CODE THESE AS 1 LINE FUNCTIONS OFF THE SPEND_PER_SUPPLIER DATAFRAME???????????

#Create the supplier count feature
Count_Supplier= DataFrame(df_filtered.pivot_table(rows=['Campus',Org_Level],values='Supplier Normalized Name',aggfunc=lambda x: len(x.unique())))
Count_Supplier = Count_Supplier.rename(columns={'Supplier Normalized Name':'Count of Suppliers'})

#Create the Spend Diffusion Metric
Max_Supplier_Spend = df_filtered.pivot_table(rows=['Campus',Org_Level,'Supplier Normalized Name'], 
                                          values=['Spend'], aggfunc=np.sum, fill_value=0).reset_index().groupby(['Campus',Org_Level]).max().drop(['Supplier Normalized Name'],1)

Total_Dept_Spend = df_filtered.pivot_table(rows=['Campus',Org_Level], 
                                          values=['Spend'], aggfunc=np.sum, fill_value=0)

Spend_Diffusion = 1-(Max_Supplier_Spend/Total_Dept_Spend)
Spend_Diffusion = Spend_Diffusion.rename(columns={'Spend': 'Spend_Diffusion'})
Log_Spend_Diffusion = pd.DataFrame(np.log(Spend_Diffusion))


#Create Spend Frequency and Average Monthly Spend/PO/Invoice Features
Dept_Summary = df_filtered.pivot_table(rows=['Campus',Org_Level], values=['Spend','# of POs', '# of Invoices'], aggfunc=np.sum, fill_value=0).reset_index()
#pivot out by campus, department, and date then find the summary stats of the time based dimensions
#Create a frame that is grouped by month and sum of all values
dept_time = df_filtered.pivot_table(rows=['Campus',Org_Level,'Spend Date' ], values=['Spend','# of POs', '# of Invoices'], aggfunc=np.sum, fill_value=0).reset_index()
dept_time = dept_time.groupby(['Campus',Org_Level,dept_time['Spend Date'].map(lambda x: (x.month,x.year))]).sum().reset_index()
avg_monthly = pd.DataFrame(dept_time.groupby(['Campus',Org_Level]).mean()['Spend'])
Count_Months = pd.DataFrame(dept_time.groupby(['Campus',Org_Level]).count()['Spend Date'])


#Merge all the features into the Department_Analysis Frame (Un-comment to run these)
Department_Analysis = pd.merge(Spend_per_Supplier,PO_by_Inv,left_index=True, right_index=True,how='left')
Department_Analysis = pd.merge(Department_Analysis,Count_Supplier,left_index=True, right_index=True,how='left')
Department_Analysis = pd.merge(Department_Analysis,Spend_Diffusion,left_index=True, right_index=True,how='left').fillna(0)
Department_Analysis = pd.merge(Department_Analysis,Count_Months,left_index=True, right_index=True,how='left').fillna(0)
#Department_Analysis = pd.merge(Department_Analysis,Log_PO_by_Inv,left_index=True, right_index=True,how='left').fillna(0)
#Department_Analysis = pd.merge(Department_Analysis,avg_monthly,left_index=True, right_index=True,how='left').fillna(0)



#For some reason 1/Department_Analysis['Count of Suppliers'] didn't work so I created column of all 1
Department_Analysis['one']=1
Department_Analysis['Optimal Diffusion']= Department_Analysis['one']-(Department_Analysis['one']/Department_Analysis['Count of Suppliers'])
Department_Analysis['% Opt-Actual'] = (Department_Analysis['Optimal Diffusion']-Department_Analysis['Spend_Diffusion'])/Department_Analysis['Optimal Diffusion']
Department_Analysis['% Opt-Actual']=Department_Analysis['% Opt-Actual'].fillna(0)
Department_Analysis[Department_Analysis['Spend_Diffusion']==-inf] = 0

del Department_Analysis['one']
del Department_Analysis['Optimal Diffusion']

#Calculate the number of non-Supplier Spend dimensions
NonSupplier_Features = len(Department_Analysis.columns) - N_Suppliers


#Create a scatterplot matrix to analyze the features
scatter_matrix(Department_Analysis.ix[:,-NonSupplier_Features:], alpha=0.2, figsize=(14, 12))


#This is a standardization of each columnwise data element by subtracting the column mean and dividing by the column standard deviation.
for col in Department_Analysis.columns:
    Department_Analysis[col + '_stdard'] = (Department_Analysis[col] - Department_Analysis[col].mean()) / Department_Analysis[col].std()


How many clusters should we have?


In [13]:
Department_Analysis


Out[13]:
BIOEXPRESS LLC CLONTECH A TAKARA BIO CO CLONTECH LABS INC DENVILLE SCIENTIFIC LIFE TECHNOLOGIES CORP NEW ENGLAND BIOLABS SIGMA ALDRICH INC SPECTRUM CHEMICALS & LABORATORY PRODUCTS SPECTRUM LABORATORY PRODUCTS THERMO FISHER SCIENTIFIC INC USA SCIENTIFIC INC VWR INTERNATIONAL INC Inv/PO Count of Suppliers Spend_Diffusion Spend Date % Opt-Actual BIOEXPRESS LLC_stdard CLONTECH A TAKARA BIO CO_stdard CLONTECH LABS INC_stdard
Campus Department Name
UCB AACHN Calif Hall Bldg Mgmt 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
AACHN Chan Off Immed Office 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
AACHN Univ House Maint & Rep 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
ABCFI AVC CFO Immed Office 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
AIDVO ADM Business Ops/Svcs-DO 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
AIDVO Advancement Info Mgmt 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
AIDVO Ext Relat & Protocol-DO 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
ALPDC DPM Project Management 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
ALPDC Plant Expenditures Proj 0.00 0 0.00 0.00 2250.90 0.00 0.00 1981.10 0.00 10304.49 0.00 1394.37 0.000000 4 -1.040794 5 2.387725 -0.145474 -0.043491 -0.084272
AZCSS Benefits 0.00 0 0.00 0.00 0.00 0.00 0.00 21.51 0.00 4.83 0.00 0.00 0.000000 2 -1.696242 2 4.392484 -0.145474 -0.043491 -0.084272
AZCSS General Admin 0.00 0 0.00 0.00 0.00 0.00 0.00 21.16 0.00 515.19 0.00 433.75 0.046520 3 -0.757300 11 2.135949 -0.145474 -0.043491 -0.084272
BAHSB B&O Schoolwd Services 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB BO Admin Stockroom 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Behavioral Lab 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Career Gen Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Career Recruitment Ctr 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Dean Gen Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB ECSM Bldg Furnishings 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 300.08 0.00 100.50 0.000000 2 -1.382756 2 3.765512 -0.145474 -0.043491 -0.084272
BAHSB ECSM Bldg Gen Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB ECSM Bldg Maintenance 0.00 0 0.00 0.00 0.00 0.00 0.00 93.20 0.00 224.24 0.00 0.00 0.000000 2 -1.225541 5 3.451082 -0.145474 -0.043491 -0.084272
BAHSB ECSM Lab Svcs 0.00 0 0.00 0.00 0.00 0.00 0.00 114.78 0.00 82.95 0.00 6.10 0.000000 3 -0.828088 5 2.242132 -0.145474 -0.043491 -0.084272
BAHSB ECSM Media Svc 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Instr EVMBA Gen Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BAHSB Instr UG Gen Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BCHCI UCBEI Research 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BOOPT AA Temp Instruc Costs 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 193.66 0.00 110.70 0.000000 2 -1.011387 2 3.022775 -0.145474 -0.043491 -0.084272
BOOPT DN Trainees 323.27 0 0.00 0.00 4273.69 0.00 0.00 137.57 0.00 910.38 161.01 190.27 0.042560 6 -1.247348 3 2.496817 -0.053842 -0.043491 -0.084272
BOOPT ER Development 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BOOPT RS Other 0.00 0 0.00 0.00 851.63 0.00 197.34 326.83 0.00 5833.15 221.73 12072.99 0.000000 6 -0.964985 3 2.157982 -0.145474 -0.043491 -0.084272
BOOPT RS Post Award Federal 871.09 0 0.00 0.00 10830.16 0.00 1593.77 366.56 0.00 20315.63 0.00 23964.88 -0.043675 6 -0.533754 4 1.640505 0.101438 -0.043491 -0.084272
BOOPT RS Post Award Other 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BOOPT RS Post Award Private 238.28 0 0.00 0.00 0.00 0.00 174.80 253.14 0.00 352.78 0.00 0.00 0.000000 4 -0.424957 4 1.566609 -0.077933 -0.043491 -0.084272
BPOPC GO Clinic Admin & Supp 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BPOPC PC Minor Hall Clinics 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BPOPC PC Refractiv Surgery Ctr 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
BQRES DN Trainees 3209.04 0 0.00 0.00 3343.52 0.00 341.05 481.30 0.00 2796.15 256.68 2071.34 0.000000 7 -0.311294 8 1.363176 0.764135 -0.043491 -0.084272
BQRES RS Other 0.00 0 0.00 288.76 1738.91 179.96 408.49 773.67 0.00 6663.93 116.62 8311.27 -0.063716 8 -0.597301 7 1.682629 -0.145474 -0.043491 -0.084272
BQRES RS Post Award Federal 428.26 0 0.00 106.38 5343.91 0.00 1351.43 32.30 0.00 21819.79 160.09 17511.93 -0.058998 8 -0.628657 8 1.718466 -0.024083 -0.043491 -0.084272
BQRES RS Post Award Private 0.00 0 0.00 0.00 445.22 0.00 464.94 246.23 0.00 813.89 21.76 682.84 0.000000 6 -0.362796 8 1.435355 -0.145474 -0.043491 -0.084272
BRCOE OLE OnlineEdInitiatveOps 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
CCHEM INSTR Ops 100.49 0 0.00 0.00 0.00 0.00 12591.87 16425.41 0.00 140783.62 0.00 63405.19 0.006072 5 -0.924897 12 2.156121 -0.116989 -0.043491 -0.084272
CCHEM RES Research 817.81 0 168.03 9096.17 19745.87 13948.46 151286.27 85347.15 0.00 672521.15 2878.70 352751.08 0.019775 10 -0.721422 12 1.801580 0.086336 -0.043491 -0.051762
CDCDN An CHEXRAY Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 135.77 0.00 214.03 0.00 734.53 0.000000 3 -1.131356 4 2.697034 -0.145474 -0.043491 -0.084272
CDCDN An Microlab Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
CDCDN An NMR Ops 0.00 0 0.00 0.00 0.00 0.00 263.46 711.02 0.00 3997.00 0.00 391.86 0.000000 4 -1.367451 6 2.823268 -0.145474 -0.043491 -0.084272
CDCDN BAS/Mat Mgmt Stores Ops 0.00 0 0.00 0.00 0.00 0.00 12140.27 7108.50 0.00 230882.15 1862.14 61709.87 0.097428 5 -1.331767 12 2.664709 -0.145474 -0.043491 -0.084272
CDCDN Bio-Safety Lab 3 0.00 0 0.00 0.00 319.54 0.00 223.35 270.61 0.00 2259.81 0.00 1665.79 0.000000 5 -0.647875 8 1.809844 -0.145474 -0.043491 -0.084272
CDCDN Chem R&D Engineering Svc 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6049.47 0.00 275.63 0.000000 2 -3.133222 9 7.266443 -0.145474 -0.043491 -0.084272
CDCDN Dean Endowment & Gifts 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 137.77 0.00 943.66 -0.154151 2 -2.060454 3 5.120908 -0.145474 -0.043491 -0.084272
CDCDN Dean Reserve 0.00 0 0.00 0.00 0.00 0.00 0.00 35.84 0.00 19941.24 0.00 15105.63 0.064539 3 -0.840271 4 2.260407 -0.145474 -0.043491 -0.084272
CDCDN E/F Facilities Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 50.91 0.00 150.60 0.00 0.00 0.000000 2 -1.375780 3 3.751559 -0.145474 -0.043491 -0.084272
CDCDN EH&S Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 418.40 0.00 145.01 0.000000 2 -1.357205 3 3.714410 -0.145474 -0.043491 -0.084272
CDCDN Undergrad Aff Ops 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
CEEEG Gen Ops 0.00 0 0.00 0.00 0.00 0.00 72.95 126.00 0.00 266.33 0.00 0.00 0.000000 3 -0.849586 4 2.274379 -0.145474 -0.043491 -0.084272
CEEEG Inst. TAS 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 450.57 0.00 214.06 0.000000 2 -1.132974 3 3.265948 -0.145474 -0.043491 -0.084272
CEEEG RES Research 1114.85 0 465.21 800.75 2253.43 577.45 33524.64 8608.45 1654.65 72709.75 229.03 30868.06 -0.001190 11 -0.645939 12 1.710532 0.170533 -0.043491 0.005735
CITRS Operating Funds 0.00 0 0.00 0.00 0.00 0.00 0.00 84.97 0.00 157.20 20.32 178.29 0.000000 4 -0.518333 9 1.691111 -0.145474 -0.043491 -0.084272
CKGEN OPS Operating Expenses 0.00 0 0.00 0.00 0.00 0.00 0.00 7.74 0.00 18.61 0.00 0.00 0.000000 2 -1.225067 2 3.450133 -0.145474 -0.043491 -0.084272
CLLAW ADM Bud,Plng&AcPersonnel 0.00 0 0.00 0.00 0.00 0.00 94.18 0.00 0.00 0.00 0.00 19.55 0.000000 2 -1.760852 2 4.521704 -0.145474 -0.043491 -0.084272
CLLAW ADM Building Services 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.000000 0 0.000000 0 0.000000 -0.145474 -0.043491 -0.084272
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...

2015 rows × 35 columns


In [163]:
#Use the silhouette score to test the cluster density at different numbers of clusters.
from sklearn import metrics


def ClusterDistance(FeatureMatrix, minRange, maxRange):
    sil=[0]
    for k in range(minRange,maxRange):
        ward = Ward(n_clusters=k)
        FeatureMatrix['category'] = ward.fit_predict(FeatureMatrix)
        #Run the Silouette Score to test the quality of the clusters
        sil.append(metrics.silhouette_score(FeatureMatrix.values, FeatureMatrix['category'].values, metric='euclidean'))
    return sil
    
#Update the input to the function here:
pd.DataFrame(ClusterDistance(Department_Analysis.ix[:,-4:],2,100)).plot(kind='line')


C:\Anaconda\lib\site-packages\numpy\core\_methods.py:55: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)
Out[163]:
<matplotlib.axes.AxesSubplot at 0x43befc50>

Cluster and Observe the Results


In [3]:
#Reduce dimensionality down to 2 axes


#Drop the category column if it exists
for col in Department_Analysis.columns:
    if col is 'category':
        Department_Analysis = Department_Analysis.drop(col,1)
        
        
#Use PCA to reduce dimensionality to 2 to check the variance explained and inspect the clusters
pca = PCA(n_components=3).fit_transform(Department_Analysis.ix[:,-NonSupplier_Features:])
pcaT = PCA(n_components=3).fit(Department_Analysis.ix[:,-NonSupplier_Features:])


#Run the heirarchical clustering
ward = Ward(n_clusters=5)
Department_Analysis['category'] = ward.fit_predict(Department_Analysis.ix[:,-NonSupplier_Features:])

#Run the Silouette Score to test the quality of the clusters
from sklearn.metrics import pairwise_distances
from sklearn import metrics


print('\nSilouette Score: %s' % metrics.silhouette_score(Department_Analysis.ix[:,-NonSupplier_Features:].values, Department_Analysis['category'].values, metric='euclidean'))
print('Variance Explained By Feature: %s' % pcaT.explained_variance_ratio_, sum(pcaT.explained_variance_ratio_))


#Run PCA and chart the two leading Components to see what the clusters look like in 2D space
sc = plt.scatter(pca[:,0],pca[:,1], s=250, c=Department_Analysis.category, alpha=0.5, edgecolors='black')
plt.legend()
plt.show()

#Group the data by supplier and plot it to see the spend per supplier/per cluster.
fig, axes = plt.subplots(nrows=5, ncols=1,figsize=(14, 14))
fig.tight_layout()
fig.subplots_adjust(hspace=.35)
ax1 = Department_Analysis.groupby('category').count()[[1]].plot(kind='bar',ax=axes[0], title='Count of Departments Per Category', label=None)
ax2 = Department_Analysis.groupby('category').sum()[[x for x in Department_Analysis.columns[0:N_Suppliers]]].plot(kind='bar',ax=axes[1], title='Sum of Spend per Supplier, Per Category').legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
ax3 = Department_Analysis.groupby('category').median()[['Inv/PO']].plot(kind='bar',ax=axes[2], title='Median Invoices per PO')
ax3.set_ylim(0,3)
ax4 = Department_Analysis.groupby('category').median()[['Spend_Diffusion']].plot(kind='bar',ax=axes[3], title='median spend diffusion per Category')
ax4.set_ylim(0,1)
ax5 = Department_Analysis.groupby('category').mean()[['Spend Date']].plot(kind='bar',ax=axes[4], title='median months per year ordering')


Silouette Score: 0.59339331472
('Variance Explained By Feature: [ 0.55742215  0.20275371  0.14274583]', 0.90292169815142909)
C:\Anaconda\lib\site-packages\matplotlib\axes.py:4747: UserWarning: No labeled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labeled objects found. "

In [8]:
#Print all the members of one or more of the categories
Department_Analysis[Department_Analysis['category'] == 0].loc['UCSF']


Out[8]:
BIOEXPRESS LLC CLONTECH A TAKARA BIO CO CLONTECH LABS INC DENVILLE SCIENTIFIC LIFE TECHNOLOGIES CORP NEW ENGLAND BIOLABS SIGMA ALDRICH INC SPECTRUM CHEMICALS & LABORATORY PRODUCTS SPECTRUM LABORATORY PRODUCTS THERMO FISHER SCIENTIFIC INC USA SCIENTIFIC INC VWR INTERNATIONAL INC Inv/PO Count of Suppliers Spend_Diffusion Spend Date % Opt-Actual BIOEXPRESS LLC_stdard CLONTECH A TAKARA BIO CO_stdard CLONTECH LABS INC_stdard
Department Name
- 1205.70 0 194.58 0.00 5516.20 78.68 6368.96 92.80 0.00 22823.32 41.23 3425.90 1.028571 9 0.425790 1 0.520986 0.196028 -0.045716 -0.046903 ...
AIDS RESEARCH INSTITUTE 0.00 0 0.00 0.00 7301.02 0.00 12191.21 0.00 0.00 30334.45 2459.02 5745.47 1.020619 5 0.477273 9 0.403409 -0.145736 -0.045716 -0.084548 ...
ANATOMY 11269.82 0 3446.92 14872.57 92476.40 1547.56 46422.29 6340.47 0.00 185469.08 5212.28 62498.97 1.014934 10 0.568231 9 0.368632 3.048766 -0.045716 0.582326 ...
ANESTHESIA/PERIOPERATIVE CARE 3561.25 0 2582.65 1396.99 63520.46 1757.83 27484.43 3354.69 0.00 175594.56 1530.73 60200.77 1.020481 10 0.485036 9 0.461071 0.863723 -0.045716 0.415116 ...
ANESTHESIA/YOUNG 261.08 0 0.00 0.00 7020.69 0.00 4942.30 834.72 0.00 17630.43 0.00 2277.59 1.000000 6 0.465207 8 0.441752 -0.071731 -0.045716 -0.084548 ...
BIOCHEMISTRY & BIOPHYSICS 8318.41 0 8487.58 4918.59 157263.21 9532.90 95368.43 16076.97 0.00 332309.85 31578.15 129086.25 1.023240 10 0.580914 9 0.354540 2.212171 -0.045716 1.557541 ...
BIOENGINEERING&THERAPEUTIC SCI 7023.69 0 4525.42 9485.80 72650.32 9285.00 38702.51 9514.55 662.15 126439.83 4276.69 66468.89 1.019768 11 0.637744 8 0.298481 1.845174 -0.045716 0.790983 ...
BIOENGINEERING/THERAPEUTIC SCI 911.58 0 0.00 190.62 7261.26 261.76 5877.77 3450.66 426.16 19969.72 703.09 7419.53 1.013889 10 0.570286 1 0.366349 0.112658 -0.045716 -0.084548 ...
CANCER CENTER 1288.98 0 0.00 1187.31 35167.65 0.00 7228.97 274.85 0.00 12651.14 2692.98 828.46 1.031847 8 0.426493 7 0.512580 0.219634 -0.045716 -0.084548 ...
CARDIOVASCULAR RESEARCH INST 25155.64 0 4270.48 1873.23 223537.60 5236.08 80937.57 8112.50 743.55 395968.31 22071.81 257702.98 1.031004 11 0.613919 9 0.324689 6.984790 -0.045716 0.741660 ...
CELL CULTURE 0.00 0 59133.15 15426.85 1273383.30 303909.75 153476.36 0.00 0.00 63653.92 0.00 94616.30 1.672156 7 0.351506 9 0.589910 -0.145736 -0.045716 11.355924 ...
CELLULAR & MOLECULAR PHARMACOL 7152.32 0 9258.49 6612.66 205826.81 19916.65 95316.99 9297.22 0.00 392417.69 14378.35 218049.10 1.026001 10 0.598848 9 0.334614 1.881635 -0.045716 1.706689 ...
CLINICAL LABS SFGH 0.00 0 0.00 0.00 880.28 0.00 37244.77 0.00 0.00 85117.91 0.00 7385.47 1.069565 4 0.348397 9 0.535471 -0.145736 -0.045716 -0.084548 ...
CLINICAL PHARMACY 0.00 0 0.00 0.00 0.00 0.00 112.13 1095.15 0.00 20509.84 0.00 8916.89 1.034722 4 0.330488 8 0.559349 -0.145736 -0.045716 -0.084548 ...
CTSI-CRS-ALL SITES 115.66 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 22601.76 0.00 1814.76 1.010989 3 0.078689 9 0.881966 -0.112951 -0.045716 -0.084548 ...
CTSI-SOM SOS AWARDS 46.33 0 0.00 0.00 2861.41 70.85 4172.86 487.33 0.00 12006.07 0.00 1920.00 1.020979 7 0.443257 8 0.482866 -0.132603 -0.045716 -0.084548 ...
CTSI-SOM-MEDICINE 0.00 0 0.00 2277.29 8549.58 372.63 8794.88 389.87 0.00 17751.37 66.34 4855.57 1.008621 8 0.587729 6 0.328310 -0.145736 -0.045716 -0.084548 ...
CTSI-SOM-NEUROLOGY 0.00 0 0.00 0.00 1629.90 0.00 182.67 0.00 0.00 6282.23 154.49 587.38 1.000000 5 0.289073 4 0.638659 -0.145736 -0.045716 -0.084548 ...
CTSI-SOM-SURGERY 206.00 0 0.00 0.00 182.01 0.00 255.63 0.00 0.00 13853.91 0.00 2152.30 1.041667 5 0.167926 8 0.790093 -0.087344 -0.045716 -0.084548 ...
CTSI-SOP-BIOPHARM SCIENCES 115.66 0 0.00 0.00 0.00 0.00 1261.89 80.66 0.00 902.48 0.00 330.95 1.000000 5 0.531182 3 0.336023 -0.112951 -0.045716 -0.084548 ...
CTSI-TRAINING 0.00 0 0.00 0.00 1839.37 0.00 161.17 0.00 0.00 6838.70 0.00 1513.56 1.000000 4 0.339435 6 0.547420 -0.145736 -0.045716 -0.084548 ...
DEAN'S OFF GRAD RESCH AFFAIRS 0.00 0 0.00 0.00 1001.78 0.00 481.91 0.00 0.00 2859.02 0.00 645.53 1.000000 4 0.426848 7 0.430869 -0.145736 -0.045716 -0.084548 ...
DENT-ORAL & MAX SURGERY 0.00 0 0.00 0.00 2537.65 0.00 5392.33 0.00 0.00 4667.22 58.32 1217.28 1.011765 5 0.611302 9 0.235873 -0.145736 -0.045716 -0.084548 ...
DEPT OF CELL & TISSUE BIOLOGY 2627.18 0 3053.23 1112.29 65264.21 858.20 23233.10 3977.67 0.00 110307.38 5452.80 46729.88 1.049474 10 0.579967 9 0.355592 0.598955 -0.045716 0.506159 ...
DEPT OF OROFACIAL SCIENCES 0.00 0 0.00 0.00 553.94 114.35 139.48 630.93 0.00 7976.96 0.00 1247.93 1.064103 6 0.251944 9 0.697667 -0.145736 -0.045716 -0.084548 ...
DERMATOLOGY 3561.01 0 261.61 0.00 52213.52 284.27 8662.67 818.92 0.00 80552.83 632.35 68420.24 1.021978 9 0.626044 9 0.295700 0.863655 -0.045716 -0.033935 ...
DIABETES CENTER 14499.05 0 5921.76 106.36 84069.58 5992.44 27647.48 3058.53 0.00 151960.28 980.38 73067.91 1.014196 10 0.586282 8 0.348576 3.964112 -0.045716 1.061133 ...
DISTRIBUTION & STORAGE 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 301089.12 0.00 287740.46 1.205128 2 0.488665 9 0.022670 -0.145736 -0.045716 -0.084548 ...
DRUG STUDIES UNIT 0.00 0 0.00 0.00 0.00 0.00 280.20 5352.42 0.00 3341.80 849.21 117.26 1.000000 5 0.461575 7 0.423031 -0.145736 -0.045716 -0.084548 ...
ENVIRONMENTAL HEALTH & SAFETY 0.00 0 0.00 0.00 0.00 0.00 0.00 7232.89 0.00 8589.08 0.00 2196.89 1.000000 3 0.523328 9 0.215007 -0.145736 -0.045716 -0.084548 ...
EPIDEMIOLOGY & BIOSTATISTICS 238.55 0 73.84 234.45 5522.30 114.73 1604.88 17.71 0.00 420953.75 614.12 2141.12 1.484536 10 0.024476 9 0.972805 -0.078117 -0.045716 -0.070262 ...
FAMILY HLTH CARE NSG 217.80 0 0.00 0.00 710.87 0.00 153.69 0.00 0.00 326.39 0.00 945.66 1.000000 5 0.598345 4 0.252068 -0.083999 -0.045716 -0.084548 ...
GLOBAL HEALTH GROUP 0.00 0 0.00 0.00 1043.06 0.00 89.85 31.93 0.00 1422.94 0.00 144.83 1.000000 5 0.479274 4 0.400907 -0.145736 -0.045716 -0.084548 ...
GRADUATE DIVISION 67.21 0 0.00 0.00 6075.20 381.71 668.88 224.49 0.00 10334.31 68.97 2803.46 1.016393 8 0.498924 9 0.429801 -0.126684 -0.045716 -0.084548 ...
GREENBLATT RESEARCH UNIT 0.00 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 8961.51 0.00 8244.91 1.075000 2 0.479176 8 0.041647 -0.145736 -0.045716 -0.084548 ...
HDFCCC OPERATIONS & ADMIN 1574.47 0 0.00 0.00 104233.78 0.00 2375.05 762.43 0.00 61999.09 226.74 29786.66 1.020290 7 0.481316 9 0.438464 0.300558 -0.045716 -0.084548 ...
HDFCCC PI LABS 7306.10 0 2791.21 5640.22 112343.91 1971.58 44517.61 1750.01 0.00 136621.05 6918.01 51399.32 1.050089 10 0.632006 9 0.297771 1.925225 -0.045716 0.455466 ...
HOOPER FOUNDATION 0.00 0 0.00 0.00 13613.86 1014.68 9219.66 739.16 0.00 45371.51 3016.90 7745.49 1.044776 7 0.437924 9 0.489089 -0.145736 -0.045716 -0.084548 ...
HORMONE RESEARCH LAB 1804.38 0 297.70 1228.79 21895.20 1256.28 9771.23 573.43 0.00 14089.12 436.83 1894.28 1.042328 10 0.588801 9 0.345776 0.365727 -0.045716 -0.026952 ...
HOWARD HUGHES MEDICAL INST 0.00 0 0.00 0.00 732.55 210.92 42.72 143.08 0.00 1768.34 0.00 585.94 1.000000 6 0.492374 1 0.409151 -0.145736 -0.045716 -0.084548 ...
INST FOR NEURODEGENERATIVE DIS 10029.86 0 905.09 160.95 37077.24 306.51 24799.19 4951.10 0.00 452258.49 1753.36 63282.75 1.019231 10 0.240571 9 0.732699 2.697292 -0.045716 0.090559 ...
INST FOR REGENERATION MEDICINE 1271.18 0 6804.79 10092.86 187255.68 1084.83 50282.54 3236.20 0.00 153014.95 10985.84 119666.07 1.042237 10 0.655587 9 0.271570 0.214588 -0.045716 1.231972 ...
INSTITUTE FOR HUMAN GENETICS 5510.68 0 1429.95 537.15 80303.61 356.70 5182.39 618.80 0.00 59628.78 1320.23 8295.80 1.025397 10 0.507896 9 0.435672 1.416302 -0.045716 0.192104 ...
LAB ANIMAL RESOURCE CENTER 166.28 0 0.00 0.00 0.00 0.00 209.08 3018.08 0.00 56557.66 0.00 56152.97 1.024876 5 0.512871 9 0.358911 -0.098602 -0.045716 -0.084548 ...
LABORATORY MEDICINE 3668.74 0 66747.86 583.58 133229.29 4522.80 17740.76 3123.08 0.00 113225.04 28078.90 193111.46 1.052427 10 0.657623 9 0.269308 0.894192 -0.045716 12.829140 ...
LABORATORY MEDICINE-VAN'T VEER 0.00 0 427.80 0.00 1572.16 0.00 81.41 593.84 0.00 10029.78 2366.75 22605.97 1.039216 7 0.400017 9 0.533313 -0.145736 -0.045716 -0.001782 ...
LPPI: INSTR & RESEARCH 127.40 0 982.30 0.00 22274.86 74.95 14725.61 1586.19 0.00 45782.76 1405.98 22951.12 1.024793 9 0.583457 9 0.343611 -0.109623 -0.045716 0.105497 ...
MASS SPECTROMETRY 71.61 0 0.00 0.00 1455.89 0.00 2584.49 412.06 0.00 762193.61 444.06 3926.66 1.051020 7 0.011535 9 0.986542 -0.125437 -0.045716 -0.084548 ...
MED-CAMPUS-ADMIN 39.34 0 0.00 176.41 9640.63 0.00 2116.77 0.00 0.00 90537.25 346.86 45467.67 1.089041 7 0.389602 9 0.545464 -0.134584 -0.045716 -0.084548 ...
MED-CAMPUS-CARDIO 2225.04 0 2138.11 212.56 9683.48 154.75 1293.49 293.15 0.00 16022.55 259.26 16473.59 1.035354 10 0.662122 9 0.264309 0.484966 -0.045716 0.329111 ...
MED-CAMPUS-GENOMIC MEDICINE 40.96 0 198.10 0.00 191.48 0.00 856.65 0.00 0.00 431.63 0.00 499.99 1.100000 6 0.613915 3 0.263302 -0.134125 -0.045716 -0.046222 ...
MED-CAMPUS-GI 9942.37 0 7370.78 6793.71 100257.21 1009.15 21293.49 3568.53 0.00 91350.71 4457.57 39582.08 1.019405 10 0.648991 9 0.278899 2.672492 -0.045716 1.341474 ...
MED-CAMPUS-HEM ONC 20897.44 0 4440.27 7965.43 124005.88 4278.82 29481.05 3590.66 0.00 240916.41 5271.05 55674.43 1.025153 10 0.514792 9 0.428009 5.777777 -0.045716 0.774510 ...
MED-CAMPUS-ID 997.08 0 6882.70 561.62 18659.46 1473.45 14906.26 152.40 0.00 62204.14 911.89 26321.17 1.035055 10 0.532546 9 0.408282 0.136893 -0.045716 1.247046 ...
MED-CAMPUS-NEPHROLOGY 1685.32 0 0.00 0.00 17204.95 0.00 14320.42 2791.92 0.00 58350.11 377.36 29325.06 1.028490 7 0.529644 9 0.382082 0.331979 -0.045716 -0.084548 ...
MED-CAMPUS-PREVENTION SCIENCE 0.00 0 0.00 0.00 0.00 0.00 40.24 39.50 0.00 316.89 0.00 0.00 1.000000 3 0.201044 6 0.698434 -0.145736 -0.045716 -0.084548 ...
MED-CAMPUS-PULMONARY 4836.93 0 1663.76 0.00 48756.12 293.08 26140.55 3760.16 0.00 81617.52 3292.10 26246.79 1.020024 9 0.584870 9 0.342022 1.225323 -0.045716 0.237339 ...
MED-CAMPUS-RHEUMATOLOGY 961.78 0 744.74 103.97 10108.02 313.58 6450.45 49.42 0.00 32284.19 6400.46 8188.82 1.062762 10 0.507904 9 0.435663 0.126887 -0.045716 0.059536 ...
MED-GERIATRICS 185.94 0 110.34 0.00 1966.83 0.00 110.49 212.68 0.00 3106.78 0.00 404.91 1.038462 7 0.490522 5 0.427724 -0.093030 -0.045716 -0.063201 ...
MED-SFGH-ADMIN 0.00 0 0.00 0.00 11900.54 0.00 1419.39 98.66 0.00 26223.27 0.00 1587.98 1.056180 5 0.363974 8 0.545033 -0.145736 -0.045716 -0.084548 ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...

132 rows × 35 columns


In [44]:
#Department Histogram Data (ignore the header)
#Department_Analysis.groupby('category').count()[[1]]

#Supplier Spend by Category
#Department_Analysis.groupby('category').sum()[[x for x in Department_Analysis.columns[0:N_Suppliers]]]

#Median INV/PO per cluster
#Department_Analysis.groupby('category').median()[['Inv/PO']]

#Median Spend Diffusion per cluster
#Department_Analysis.groupby('category').median()[['Spend_Diffusion']]

#Median Spend Date
Department_Analysis.groupby('category').median()[['Spend Date']]


Out[44]:
Spend Date
category
0 10
1 3
2 11
3 1
4 4

In [9]:
Department_Analysis.to_csv('c:/users/aclark/desktop/clustered.csv')

In [ ]: