KMeans

Initialized cluster centers, assigning points to each cluster and repeating this process trying to minimize Euclidean distance from points to cluster centers with each iteration.

Outliers were calculated as being the point that was the furthest Euclidean distance away from its cluster center


In [6]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
import 	numpy as np
import re
import sys
import csv
from sklearn import preprocessing
import sklearn.metrics as met
from sklearn.decomposition import PCA
import matplotlib.cm as cm


df1=pd.read_csv('team_out_1.csv')
df2=pd.read_csv('team_out_a2.csv')
df3=pd.read_csv('team_out_a3.csv')
df4=pd.read_csv('team_out_Yash.csv')
df5=pd.read_csv('team_out_Yash_part1.csv')

df=df1.append(df2)
df=df.append(df3)
df=df.append(df4)
df=df.append(df5)


df.dropna(inplace=True)
df.reset_index(inplace=True,drop=True)
# df=df[df.Total_Expenses>0]
df


Out[6]:
Filename EIN Program_Exp Liabilities_To_Asset Working_Capital Surplus_Margin Total_Expenses
0 201523229349300327.xml 510311790.0 0.989619 0.091802 1.574677 -0.078663 464318.0
1 201543089349301829.xml 261460932.0 0.965378 0.000000 3.910347 -0.042674 27439.0
2 201533179349306298.xml 270609504.0 0.942276 0.049206 0.655152 0.088597 384828.0
3 201533209349304768.xml 521548962.0 0.384526 0.000814 30.392756 0.633126 50912.0
4 201533179349307343.xml 731653383.0 0.620560 0.029487 1.083739 0.105901 28123.0
5 201533189349300608.xml 237324566.0 0.758902 0.736982 -0.176215 -0.003914 608126.0
6 201523069349301367.xml 43259150.0 0.736312 0.181998 0.378592 0.057534 212968.0
7 201533069349300963.xml 621273871.0 0.644822 0.234247 14.413163 0.496122 1426413.0
8 201523099349300542.xml 541897455.0 0.845659 0.138531 0.414986 -0.016769 461512.0
9 201533099349301033.xml 251869168.0 0.762357 0.000000 8.673984 0.122244 102044.0
10 201523169349304367.xml 376046335.0 0.745386 0.000000 16.463976 0.252579 37170.0
11 201533099349301803.xml 50454409.0 0.915261 0.000000 1.158768 0.062649 557347.0
12 201523069349300142.xml 272364809.0 0.920307 0.003606 0.240597 -0.029663 302061.0
13 201543039349301204.xml 463812139.0 0.664088 0.648413 0.045823 0.088812 139974.0
14 201523089349301462.xml 561797737.0 0.704623 0.432671 0.000000 -0.050384 214773.0
15 201533069349300788.xml 590638867.0 0.000000 0.000000 0.360157 -0.055883 762522.0
16 201533079349300238.xml 273314429.0 1.000000 0.000000 0.000000 0.107584 26171.0
17 201523209349314257.xml 710801566.0 1.000000 0.000000 0.000000 -8.212337 1145333.0
18 201523209349311332.xml 752204594.0 0.896895 0.069715 0.535678 -0.064100 3889691.0
19 201533179349302173.xml 251374594.0 0.000000 0.313630 0.337537 0.014809 423512.0
20 201533179349307048.xml 450448773.0 0.991205 0.051348 0.414930 0.161552 185894.0
21 201533209349302633.xml 363789851.0 0.000000 0.232011 0.521133 -0.128997 7329143.0
22 201533099349301113.xml 411712804.0 0.477697 0.009887 0.751388 -0.014507 814720.0
23 201523039349300127.xml 381557861.0 0.832159 0.063834 2.988120 0.170911 1369609.0
24 201523079349301652.xml 273741627.0 0.945120 0.000000 0.863663 0.094985 1234798.0
25 201533039349300813.xml 411495321.0 0.668161 0.040796 144.508488 -0.139777 1108143.0
26 201533139349300208.xml 60666277.0 0.000000 0.644904 0.654848 -0.296143 19649432.0
27 201533069349301413.xml 42616064.0 0.951218 0.338683 0.178517 0.041123 33254203.0
28 201533079349300003.xml 581651220.0 0.910347 0.026832 0.945357 -0.020293 324797.0
29 201523069349300957.xml 341496171.0 0.850617 0.068838 1.125712 0.212899 672775.0
... ... ... ... ... ... ... ...
75744 201542929349301039.xml 205158717.0 0.490545 0.000000 0.496350 0.000000 52195.0
75745 201542929349301104.xml 440296401.0 0.000000 0.001255 2.673444 -0.075404 3921954.0
75746 201542929349301204.xml 131084330.0 0.000000 0.949950 0.043382 -0.079563 49480300.0
75747 201503109349302210.xml 30228267.0 0.726896 0.048491 0.484947 0.059259 1275474.0
75748 201503109349302270.xml 741718905.0 0.000000 0.444349 0.536678 -0.021724 700826.0
75749 201522549349300117.xml 150249365.0 0.000000 0.000000 2.604291 1.828035 54487.0
75750 201522549349300127.xml 942763918.0 0.000000 0.126873 6.226369 -0.160318 61868.0
75751 201522549349300142.xml 453774366.0 0.000000 0.006878 1.221741 0.337512 387277.0
75752 201522549349300202.xml 860507921.0 0.751038 0.256169 1.011925 -0.045928 3732567.0
75753 201522549349300217.xml 236390816.0 0.000000 0.128085 19.697597 -2.047494 227081.0
75754 201522549349300247.xml 940689854.0 1.000000 0.562191 0.151220 -0.047202 291609.0
75755 201522549349300317.xml 830164620.0 0.000000 0.009078 5.486683 0.127495 188778.0
75756 201522549349300322.xml 274726130.0 0.991514 0.003792 1.651461 0.392873 1796484.0
75757 201503139349301280.xml 810677279.0 0.564601 0.016695 4.338949 0.529888 85951.0
75758 201503139349301285.xml 232090256.0 0.767743 1.176003 -0.257737 -0.065425 16050928.0
75759 201503139349301295.xml 202902396.0 0.009901 0.004141 1704.089109 0.979933 808.0
75760 201503139349301300.xml 222471909.0 0.940847 1.457766 -1.793719 0.013922 469665.0
75761 201503139349301315.xml 50359008.0 0.634199 0.006712 3.553722 0.199138 179469.0
75762 201503139349301335.xml 20530732.0 0.796464 0.874227 0.345519 -0.024264 691125.0
75763 201503139349301400.xml 60668594.0 0.858908 0.183348 0.809719 -0.114802 28331141.0
75764 201503139349301405.xml 570884504.0 0.960711 0.202614 4.412177 -0.360560 57650.0
75765 201513159349303976.xml 911075950.0 0.912792 0.112852 6.978676 0.023383 1188586.0
75766 201513159349303991.xml 450537391.0 0.882861 1.000000 0.000000 0.000000 33214334.0
75767 201513159349304006.xml 630985623.0 1.000000 0.164163 1.350470 0.070756 201749.0
75768 201513159349304021.xml 930854620.0 0.221201 0.000121 14.605622 0.075426 62260.0
75769 201513159349304046.xml 263218152.0 0.661838 0.163034 0.730298 -0.017285 508851.0
75770 201513159349304051.xml 520887806.0 0.819248 0.417553 0.537307 0.028730 5894235.0
75771 201513159349304061.xml 942608741.0 0.893704 0.009907 1.638578 0.044649 243668.0
75772 201513159349304071.xml 330841281.0 0.807812 0.342269 0.836017 0.297505 25594615.0
75773 201513159349304076.xml 570751500.0 0.887786 0.415570 8.571501 0.063448 1610096.0

75774 rows × 7 columns


In [4]:
%matplotlib inline

temp_df=df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin','Total_Expenses']]


def evaluate_clusters(metrics_df,max_clusters):
    error = np.zeros(max_clusters+1)
    error[0] = 0;
    for k in range(1,max_clusters+1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit(metrics_df)
        error[k] = kmeans.inertia_

    plt.plot(range(1,len(error)),error[1:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Error')


evaluate_clusters(temp_df,10)


Through the graph we determined number of clusters to be 5


In [7]:
class kmeans_clustering:

    def __init__(self, file_1, file_2, file_3,file_4,file_5):
        self.file1 = file_1
        self.file2 = file_2
        self.file3 = file_3
        self.file4 = file_4
        self.file5 = file_5
        self.df = pd.DataFrame()
        self.df_norm = pd.DataFrame()
        print('----Initialization complete-------')

    def read_data(self):
        f1 = open(self.file1, 'r')
        f2 = open(self.file2, 'r')
        f3 = open(self.file3, 'r')
        f4 = open(self.file4, 'r')
        f5 = open(self.file5, 'r')


        reader = csv.reader(f1)
        data1 = list(reader)

        reader = csv.reader(f2)
        data2 = list(reader)

        reader = csv.reader(f3)
        data3 = list(reader)
        
        reader = csv.reader(f4)
        data4 = list(reader)
        
        reader = csv.reader(f5)
        data5 = list(reader)

        print('data1 = ' + str(len(data1)))
        print('data2 = ' + str(len(data2)))
        print('data3 = ' + str(len(data3)))
        print('data4 = ' + str(len(data4)))
        print('data5 = ' + str(len(data5)))


        data1.extend(data2)
        print('data1 length after 1st merge = ' + str(len(data1)))

        data1.extend(data3)
        print('data1 length after (subtract 19120, have spaces after scraping): ' + str(len(data1)))
        
        data1.extend(data4)
        data1.extend(data5)
        

        fa = [ ]
        fb = [ ]
        fc = [ ]
        fd = [ ]
        fe = [ ]

        for i in range(0, len(data1)):
            if len(data1[i]) != 0:
                fa.append(float(data1[i][2])) # feature1:prog-expense-ratio
                fb.append(float(data1[i][3])) # feature2:asset-liability-ratio
                fc.append(float(data1[i][4])) # feature3:working-capital-ratio
                fd.append(float(data1[i][5])) # feature4:surplus-margin
                fe.append(float(data1[i][6])) # feature5:total-amt
		
        dfa = pd.DataFrame(fa)
        dfb = pd.DataFrame(fb)
        dfc = pd.DataFrame(fc)
        dfd = pd.DataFrame(fd)
        dfe = pd.DataFrame(fe)

        self.df = pd.concat([dfa, dfb, dfc, dfd, dfe], axis = 1)
		#print('dataframe df combined')
		#print(self.df)
	
    def create_clusters(self):
        array_norm  = preprocessing.normalize(self.df)
		#self.df_norm = pd.DataFrame(preprocessing.normalize(self.df))
        
        global norm_array
        norm_array=array_norm
        
        
        print('normalized dataframe')
        print(pd.DataFrame(array_norm))
        print('compute kmeans clusters')

        num = 2

		#Uncomment the following code if you want to evaluate best cluster#
# 		'''
# 		for i in range(0,10):
# 			kmeans = KMeans(init='k-means++', n_clusters=num, n_init=10)
# 			kmeans.fit_predict(array_norm)
# 			error = kmeans.inertia_
# 			#print(" Total error with " + str(num) + " clusters = " + str(error))
# 			num = num + 1
# 			score = met.silhouette_score(array_norm, kmeans.labels_, metric='euclidean',sample_size=1000)
# 			print('# clusters : ' + str(num) + 'silhoutte coefficent : ' + str(score))
# 		'''

		# Run kmeans on best clusters#
		
        kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)

        global k
        k=kmeans.fit_predict(array_norm)
        
        global cluster_labels
        cluster_labels = kmeans.labels_

        
        global cluster_centers
        cluster_centers=kmeans.cluster_centers_
        print('-------------------------------------')
        score = met.silhouette_score(array_norm, kmeans.labels_, metric='euclidean',sample_size=1000)
        print('silhoutte coefficent : ' + str(score))

        #PCA to lower dimensionality of the data
        pca_2 = PCA(2)
        plot_columns = pca_2.fit_transform(array_norm)
        
        plt.xlabel("x-axis")
        plt.ylabel("y-axis")
        plt.title("K-means++ clustering")

        i=0

        for sample in plot_columns:
            if kmeans.labels_[i] == 0:
                plt.scatter(sample[0],sample[1],color="c",s=75,marker="o")
            if kmeans.labels_[i] == 1:
                plt.scatter(sample[0],sample[1],s=75,marker="*",color="chartreuse")
            if kmeans.labels_[i] == 2:
                plt.scatter(sample[0],sample[1],color="green",s=75,marker="v")
            if kmeans.labels_[i] == 3:
                plt.scatter(sample[0],sample[1],color="cyan",s=75,marker="^")
            if kmeans.labels_[i] == 4:
                plt.scatter(sample[0],sample[1],color="red",s=75,marker="^")
            i += 1
        plt.show()


file_1 = 'team_out_1.txt'
file_2 = 'team_out_a2.txt'
file_3 = 'team_out_a3.txt'
file_4='team_out_Yash.txt'
file_5='team_out_Yash_part1.txt'

k=[]
norm_array=[]
cluster_centers=[]
cluster_labels = []
class_instance = kmeans_clustering(file_1, file_2, file_3,file_4,file_5)
class_instance.read_data()
class_instance.create_clusters()


----Initialization complete-------
data1 = 38240
data2 = 19794
data3 = 23592
data4 = 20410
data5 = 6126
data1 length after 1st merge = 58034
data1 length after (subtract 19120, have spaces after scraping): 81626
normalized dataframe
                  0             1             2             3         4
0      2.131339e-06  1.977141e-07  3.391377e-06 -1.694160e-07  1.000000
1      3.518269e-05  0.000000e+00  1.425105e-04 -1.555219e-06  1.000000
2      2.448563e-06  1.278658e-07  1.702455e-06  2.302254e-07  1.000000
3      7.552761e-06  1.598110e-08  5.969663e-04  1.243570e-05  1.000000
4      2.206591e-05  1.048490e-06  3.853569e-05  3.765625e-06  1.000000
5      1.247935e-06  1.211890e-06 -2.897675e-07 -6.435478e-09  1.000000
6      3.457386e-06  8.545792e-07  1.777695e-06  2.701533e-07  1.000000
7      4.520587e-07  1.642210e-07  1.010448e-05  3.478109e-07  1.000000
8      1.832367e-06  3.001670e-07  8.991878e-07 -3.633486e-08  1.000000
9      7.470870e-06  0.000000e+00  8.500239e-05  1.197957e-06  1.000000
10     2.005343e-05  0.000000e+00  4.429372e-04  6.795234e-06  1.000000
11     1.642175e-06  0.000000e+00  2.079079e-06  1.124058e-07  1.000000
12     3.046760e-06  1.193733e-08  7.965182e-07 -9.820306e-08  1.000000
13     4.744364e-06  4.632382e-06  3.273665e-07  6.344877e-07  1.000000
14     3.280780e-06  2.014551e-06  0.000000e+00 -2.345903e-07  1.000000
15     0.000000e+00  0.000000e+00  4.723240e-07 -7.328753e-08  1.000000
16     3.821023e-05  0.000000e+00  0.000000e+00  4.110799e-06  1.000000
17     8.731085e-07  0.000000e+00  0.000000e+00 -7.170261e-06  1.000000
18     2.305827e-07  1.792308e-08  1.377174e-07 -1.647954e-08  1.000000
19     0.000000e+00  7.405453e-07  7.969953e-07  3.496711e-08  1.000000
20     5.332096e-06  2.762217e-07  2.232079e-06  8.690540e-07  1.000000
21     0.000000e+00  3.165589e-08  7.110422e-08 -1.760049e-08  1.000000
22     5.863323e-07  1.213551e-08  9.222656e-07 -1.780591e-08  1.000000
23     6.075890e-07  4.660744e-08  2.181732e-06  1.247884e-07  1.000000
24     7.654044e-07  0.000000e+00  6.994364e-07  7.692317e-08  1.000000
25     6.029556e-07  3.681503e-08  1.304060e-04 -1.261360e-07  1.000000
26     0.000000e+00  3.282049e-08  3.332658e-08 -1.507132e-08  1.000000
27     2.860444e-08  1.018467e-08  5.368263e-09  1.236626e-09  1.000000
28     2.802819e-06  8.261259e-08  2.910608e-06 -6.247890e-08  1.000000
29     1.264341e-06  1.023200e-07  1.673237e-06  3.164497e-07  1.000000
...             ...           ...           ...           ...       ...
75744  9.398315e-06  0.000000e+00  9.509536e-06  0.000000e+00  1.000000
75745  0.000000e+00  3.200816e-10  6.816613e-07 -1.922601e-08  1.000000
75746  0.000000e+00  1.919854e-08  8.767459e-10 -1.607975e-09  1.000000
75747  5.699027e-07  3.801801e-08  3.802091e-07  4.646013e-08  1.000000
75748  0.000000e+00  6.340360e-07  7.657794e-07 -3.099764e-08  1.000000
75749  0.000000e+00  0.000000e+00  4.779656e-05  3.354993e-05  1.000000
75750  0.000000e+00  2.050710e-06  1.006396e-04 -2.591285e-06  1.000000
75751  0.000000e+00  1.776056e-08  3.154694e-06  8.715013e-07  1.000000
75752  2.012123e-07  6.863091e-08  2.711069e-07 -1.230454e-08  1.000000
75753  0.000000e+00  5.640484e-07  8.674260e-05 -9.016582e-06  1.000000
75754  3.429249e-06  1.927893e-06  5.185698e-07 -1.618661e-07  1.000000
75755  0.000000e+00  4.808916e-08  2.906421e-05  6.753696e-07  1.000000
75756  5.519192e-07  2.110600e-09  9.192742e-07  2.186902e-07  1.000000
75757  6.568868e-06  1.942418e-07  5.048166e-05  6.165004e-06  1.000000
75758  4.783169e-08  7.326701e-08 -1.605745e-08 -4.076068e-09  1.000000
75759  5.249886e-06  2.195538e-06  9.035736e-01  5.195984e-04  0.428433
75760  2.003230e-06  3.103842e-06 -3.819145e-06  2.964244e-08  1.000000
75761  3.533750e-06  3.740158e-08  1.980132e-05  1.109597e-06  1.000000
75762  1.152416e-06  1.264933e-06  4.999374e-07 -3.510865e-08  1.000000
75763  3.031673e-08  6.471599e-09  2.858051e-08 -4.052156e-09  1.000000
75764  1.666455e-05  3.514561e-06  7.653386e-05 -6.254289e-06  1.000000
75765  7.679648e-07  9.494667e-08  5.871411e-06  1.967259e-08  1.000000
75766  2.658072e-08  3.010748e-08  0.000000e+00  0.000000e+00  1.000000
75767  4.956654e-06  8.137010e-07  6.693813e-06  3.507152e-07  1.000000
75768  3.552866e-06  1.942938e-09  2.345908e-04  1.211465e-06  1.000000
75769  1.300652e-06  3.203959e-07  1.435191e-06 -3.396852e-08  1.000000
75770  1.389913e-07  7.084098e-08  9.115801e-08  4.874215e-09  1.000000
75771  3.667711e-06  4.065639e-08  6.724633e-06  1.832371e-07  1.000000
75772  3.156181e-08  1.337271e-08  3.266378e-08  1.162375e-08  1.000000
75773  5.513871e-07  2.581028e-07  5.323596e-06  3.940635e-08  1.000000

[75774 rows x 5 columns]
compute kmeans clusters
-------------------------------------
silhoutte coefficent : 0.997794498966

In [64]:
#OUTLIER ANALYSIS
from sklearn import preprocessing

def distance(v1,centroid):
    maxx = 0
    for i,value in enumerate(v1):
        temp1 = np.sqrt(np.sum((v1[i]-centroid)**2))
        if(temp1>maxx):
            maxx = temp1
            max_val = []
            max_val.append(value)
            max_val.append(i)
            max_val.append(temp1)
    return max_val


cluster1 = np.where(cluster_labels==0)
cluster2 = np.where(cluster_labels==1)
cluster3 = np.where(cluster_labels==2)
cluster4 = np.where(cluster_labels==3)
cluster5 = np.where(cluster_labels==4)

norm_df=df.copy()
norm_df=norm_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin','Total_Expenses']]
x = norm_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
norm_df = pd.DataFrame(x_scaled)

outliers = pd.DataFrame()
outliers_index = []

cluster1_entries = norm_df.loc[cluster1].as_matrix()
outlier_cluster1 = distance(cluster1_entries,cluster_centers[0])
outliers_index.append(cluster1[0][outlier_cluster1[1]])

cluster2_entries = norm_df.loc[cluster2].as_matrix()
outlier_cluster2 = distance(cluster2_entries,cluster_centers[1])
outliers_index.append(cluster2[0][outlier_cluster2[1]])

cluster3_entries = norm_df.loc[cluster3].as_matrix()
outlier_cluster3 = distance(cluster3_entries,cluster_centers[2])
outliers_index.append(cluster3[0][outlier_cluster3[1]])

cluster4_entries = norm_df.loc[cluster4].as_matrix()
outlier_cluster4 = distance(cluster4_entries,cluster_centers[3])
outliers_index.append(cluster4[0][outlier_cluster4[1]])

cluster5_entries = norm_df.loc[cluster5].as_matrix()
outlier_cluster5 = distance(cluster5_entries,cluster_centers[4])
outliers_index.append(cluster5[0][outlier_cluster5[1]])

df.loc[outliers_index]


Out[64]:
Filename EIN Program_Exp Liabilities_To_Asset Working_Capital Surplus_Margin Total_Expenses
31116 201610419349301801.xml 521221108.0 0.400001 16883.315790 -1.915329 -9.796983 167472.0
49876 201641329349301804.xml 232787307.0 1.000000 2728.935484 -234.905556 0.000000 360.0
70485 201513299349300711.xml 481252775.0 0.000000 0.000000 0.000000 0.008741 0.0
54408 201610439349303016.xml 561949970.0 1.000000 0.000000 1.000000 0.000000 1.0
2827 201542589349300999.xml 352090479.0 0.857410 0.217362 298.930677 -12877.666670 4818.0

Results

These 5 businesses are the outliers for each of their clusters, they can all be seen as financially inefficient because their metrics are lacking in some aspects. For instance the first two have very high liabilities to assets ratio but low working capital ratio. On the other hand, the last row has very high working capital ratio but low and very negative surplus margin