Find core samples of high density and expand clusters from them.
The minimum number of samples in a neighborhood for a point to be considered as a core point was set at 10 and the maximum distance between two samples within the same neighborhood was set at 0.1.
In [55]:
import pandas as pd
df1=pd.read_csv('team_out_1.csv')
df2=pd.read_csv('team_out_a2.csv')
df=df1.append(df2)
df.dropna(inplace=True)
df.reset_index(inplace=True,drop=True)
df=df[df.Total_Expenses>0]
df=df[df.Program_Exp<=1]
df
Out[55]:
Filename
EIN
Program_Exp
Liabilities_To_Asset
Working_Capital
Surplus_Margin
Total_Expenses
0
201523229349300327.xml
510311790.0
0.989619
0.091802
1.574677
-0.078663
4.643180e+05
1
201543089349301829.xml
261460932.0
0.965378
0.000000
3.910347
-0.042674
2.743900e+04
2
201533179349306298.xml
270609504.0
0.942276
0.049206
0.655152
0.088597
3.848280e+05
3
201533209349304768.xml
521548962.0
0.384526
0.000814
30.392756
0.633126
5.091200e+04
4
201533179349307343.xml
731653383.0
0.620560
0.029487
1.083739
0.105901
2.812300e+04
5
201533189349300608.xml
237324566.0
0.758902
0.736982
-0.176215
-0.003914
6.081260e+05
6
201523069349301367.xml
43259150.0
0.736312
0.181998
0.378592
0.057534
2.129680e+05
7
201533069349300963.xml
621273871.0
0.644822
0.234247
14.413163
0.496122
1.426413e+06
8
201523099349300542.xml
541897455.0
0.845659
0.138531
0.414986
-0.016769
4.615120e+05
9
201533099349301033.xml
251869168.0
0.762357
0.000000
8.673984
0.122244
1.020440e+05
10
201523169349304367.xml
376046335.0
0.745386
0.000000
16.463976
0.252579
3.717000e+04
11
201533099349301803.xml
50454409.0
0.915261
0.000000
1.158768
0.062649
5.573470e+05
12
201523069349300142.xml
272364809.0
0.920307
0.003606
0.240597
-0.029663
3.020610e+05
13
201543039349301204.xml
463812139.0
0.664088
0.648413
0.045823
0.088812
1.399740e+05
14
201523089349301462.xml
561797737.0
0.704623
0.432671
0.000000
-0.050384
2.147730e+05
15
201533069349300788.xml
590638867.0
0.000000
0.000000
0.360157
-0.055883
7.625220e+05
16
201533079349300238.xml
273314429.0
1.000000
0.000000
0.000000
0.107584
2.617100e+04
17
201523209349314257.xml
710801566.0
1.000000
0.000000
0.000000
-8.212337
1.145333e+06
18
201523209349311332.xml
752204594.0
0.896895
0.069715
0.535678
-0.064100
3.889691e+06
19
201533179349302173.xml
251374594.0
0.000000
0.313630
0.337537
0.014809
4.235120e+05
20
201533179349307048.xml
450448773.0
0.991205
0.051348
0.414930
0.161552
1.858940e+05
21
201533209349302633.xml
363789851.0
0.000000
0.232011
0.521133
-0.128997
7.329143e+06
22
201533099349301113.xml
411712804.0
0.477697
0.009887
0.751388
-0.014507
8.147200e+05
23
201523039349300127.xml
381557861.0
0.832159
0.063834
2.988120
0.170911
1.369609e+06
24
201523079349301652.xml
273741627.0
0.945120
0.000000
0.863663
0.094985
1.234798e+06
25
201533039349300813.xml
411495321.0
0.668161
0.040796
144.508488
-0.139777
1.108143e+06
26
201533139349300208.xml
60666277.0
0.000000
0.644904
0.654848
-0.296143
1.964943e+07
27
201533069349301413.xml
42616064.0
0.951218
0.338683
0.178517
0.041123
3.325420e+07
28
201533079349300003.xml
581651220.0
0.910347
0.026832
0.945357
-0.020293
3.247970e+05
29
201523069349300957.xml
341496171.0
0.850617
0.068838
1.125712
0.212899
6.727750e+05
...
...
...
...
...
...
...
...
38884
201533099349301698.xml
912130056.0
0.760828
0.610684
0.104169
-2.163589
7.340370e+05
38885
201543119349300204.xml
237198698.0
0.819953
0.549109
0.248479
0.060347
3.518240e+05
38886
201543139349302814.xml
462250934.0
0.131636
0.960863
0.001425
0.005347
2.279017e+06
38887
201533219349301213.xml
231416559.0
0.598760
0.802416
0.248929
-0.207427
8.819110e+05
38888
201543179349306629.xml
223128187.0
0.621104
0.040543
1.574808
-0.133304
6.866690e+05
38889
201533209349316263.xml
731585237.0
0.849584
0.133037
0.508528
-0.054146
2.441360e+05
38890
201543079349301044.xml
581736427.0
0.887224
1.390273
-1.014415
0.052991
1.413780e+05
38891
201533139349300123.xml
470841633.0
0.730878
0.237917
6.796036
0.615454
8.076930e+05
38892
201543179349305429.xml
943345498.0
0.570344
0.031022
3.196737
0.146015
5.820010e+05
38893
201543179349308719.xml
264795329.0
0.984882
1.067790
-0.013730
-0.013601
6.089029e+06
38894
201533179349302373.xml
262709818.0
1.000000
1.000000
0.000000
0.000000
3.369300e+04
38895
201620149349300127.xml
431129770.0
0.868905
0.203344
0.818979
0.041341
1.113451e+07
38896
201630129349300803.xml
592240502.0
0.858238
0.090567
1.552281
0.165716
6.398019e+06
38897
201620119349300422.xml
61462359.0
0.843057
0.087344
0.250055
-0.086286
6.898120e+05
38898
201630129349300723.xml
630652760.0
0.992892
0.301988
0.133494
0.000101
3.599537e+06
38899
201610139349300811.xml
341472960.0
1.000000
1.000000
0.000000
0.000000
2.762930e+05
38900
201600199349300500.xml
800182020.0
0.594838
0.141928
0.037224
-0.104980
1.481881e+06
38901
201600399349300955.xml
462754433.0
0.000000
0.000000
0.235223
0.095712
4.060960e+05
38902
201600369349301750.xml
742744885.0
0.975219
0.000000
0.452366
-0.045373
2.315450e+05
38903
201610399349300226.xml
471127421.0
0.138038
0.000000
2.097551
2.163548
1.495530e+05
38904
201620359349300612.xml
742244155.0
0.901125
0.559978
0.068062
0.008963
2.399374e+07
38905
201610429349302361.xml
940764760.0
0.992275
0.788509
0.034823
0.014275
5.107870e+05
38906
201610369349301226.xml
840588263.0
0.988925
0.080860
0.313222
0.166371
2.257280e+05
38907
201640359349300129.xml
20650275.0
0.681210
0.498341
0.213852
0.038597
3.040090e+05
38908
201600429349301735.xml
942418861.0
0.987789
0.015595
3.429593
0.526883
2.165501e+06
38909
201600359349302005.xml
223201871.0
0.907986
0.000000
3.052342
0.281392
2.319760e+05
38910
201600419349301130.xml
650599763.0
0.955581
4.173882
-0.414036
0.058141
1.489220e+05
38911
201640359349301614.xml
470461460.0
0.465235
0.014051
1.666884
0.182197
2.236610e+05
38912
201600429349302135.xml
990208381.0
0.852344
0.047928
0.815871
0.229352
1.879170e+05
38913
201610229349300741.xml
510216586.0
0.521938
0.746625
0.529489
0.026507
4.834544e+09
38657 rows × 7 columns
small_df=df[df.Total_Expenses<1000000]
med_df=df[df.Total_Expenses>1000000] med_df=med_df[df.Total_Expenses<10000000]
large_df=df[df.Total_Expenses<50000000] large_df=large_df[df.Total_Expenses>10000000]
national_df=df[df.Total_Expenses>50000000]
In [56]:
#Getting a list of positive businesses
temp=df[df.Program_Exp>.9]
temp=temp[temp.Liabilities_To_Asset<.2]
lst_temp=list(temp['EIN'])
In [58]:
df.reset_index(drop=True,inplace=True)
norm_df=df.copy()
norm_df=norm_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin','Total_Expenses']]
from sklearn import preprocessing
x = norm_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
norm_df = pd.DataFrame(x_scaled)
norm_df["Filename"]=df['Filename']
norm_df["EIN"]=df['EIN']
norm_df.columns=['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin','Total_Expenses','Filename','EIN']
norm_df.set_index('EIN',inplace=True)
norm_df
Out[58]:
Program_Exp
Liabilities_To_Asset
Working_Capital
Surplus_Margin
Total_Expenses
Filename
EIN
510311790.0
0.989619
0.013484
0.463650
0.957231
0.000068
201523229349300327.xml
261460932.0
0.965378
0.013479
0.463657
0.957231
0.000004
201543089349301829.xml
270609504.0
0.942276
0.013482
0.463647
0.957232
0.000056
201533179349306298.xml
521548962.0
0.384526
0.013479
0.463740
0.957234
0.000007
201533209349304768.xml
731653383.0
0.620560
0.013480
0.463649
0.957232
0.000004
201533179349307343.xml
237324566.0
0.758902
0.013522
0.463645
0.957231
0.000089
201533189349300608.xml
43259150.0
0.736312
0.013489
0.463646
0.957231
0.000031
201523069349301367.xml
621273871.0
0.644822
0.013492
0.463690
0.957233
0.000209
201533069349300963.xml
541897455.0
0.845659
0.013487
0.463646
0.957231
0.000068
201523099349300542.xml
251869168.0
0.762357
0.013479
0.463672
0.957232
0.000015
201533099349301033.xml
376046335.0
0.745386
0.013479
0.463696
0.957232
0.000005
201523169349304367.xml
50454409.0
0.915261
0.013479
0.463649
0.957231
0.000082
201533099349301803.xml
272364809.0
0.920307
0.013479
0.463646
0.957231
0.000044
201523069349300142.xml
463812139.0
0.664088
0.013517
0.463645
0.957232
0.000020
201543039349301204.xml
561797737.0
0.704623
0.013504
0.463645
0.957231
0.000031
201523089349301462.xml
590638867.0
0.000000
0.013479
0.463646
0.957231
0.000112
201533069349300788.xml
273314429.0
1.000000
0.013479
0.463645
0.957232
0.000004
201533079349300238.xml
710801566.0
1.000000
0.013479
0.463645
0.957200
0.000168
201523209349314257.xml
752204594.0
0.896895
0.013483
0.463647
0.957231
0.000569
201523209349311332.xml
251374594.0
0.000000
0.013497
0.463646
0.957231
0.000062
201533179349302173.xml
450448773.0
0.991205
0.013482
0.463646
0.957232
0.000027
201533179349307048.xml
363789851.0
0.000000
0.013492
0.463647
0.957231
0.001072
201533209349302633.xml
411712804.0
0.477697
0.013479
0.463648
0.957231
0.000119
201533099349301113.xml
381557861.0
0.832159
0.013482
0.463654
0.957232
0.000200
201523039349300127.xml
273741627.0
0.945120
0.013479
0.463648
0.957232
0.000181
201523079349301652.xml
411495321.0
0.668161
0.013481
0.464095
0.957231
0.000162
201533039349300813.xml
60666277.0
0.000000
0.013516
0.463647
0.957230
0.002874
201533139349300208.xml
42616064.0
0.951218
0.013499
0.463646
0.957231
0.004864
201533069349301413.xml
581651220.0
0.910347
0.013480
0.463648
0.957231
0.000048
201533079349300003.xml
341496171.0
0.850617
0.013483
0.463649
0.957232
0.000098
201523069349300957.xml
...
...
...
...
...
...
...
912130056.0
0.760828
0.013514
0.463645
0.957223
0.000107
201533099349301698.xml
237198698.0
0.819953
0.013511
0.463646
0.957231
0.000051
201543119349300204.xml
462250934.0
0.131636
0.013535
0.463645
0.957231
0.000333
201543139349302814.xml
231416559.0
0.598760
0.013526
0.463646
0.957230
0.000129
201533219349301213.xml
223128187.0
0.621104
0.013481
0.463650
0.957231
0.000100
201543179349306629.xml
731585237.0
0.849584
0.013486
0.463647
0.957231
0.000036
201533209349316263.xml
581736427.0
0.887224
0.013560
0.463642
0.957231
0.000021
201543079349301044.xml
470841633.0
0.730878
0.013493
0.463666
0.957234
0.000118
201533139349300123.xml
943345498.0
0.570344
0.013481
0.463655
0.957232
0.000085
201543179349305429.xml
264795329.0
0.984882
0.013541
0.463645
0.957231
0.000891
201543179349308719.xml
262709818.0
1.000000
0.013537
0.463645
0.957231
0.000005
201533179349302373.xml
431129770.0
0.868905
0.013491
0.463648
0.957231
0.001629
201620149349300127.xml
592240502.0
0.858238
0.013484
0.463650
0.957232
0.000936
201630129349300803.xml
61462359.0
0.843057
0.013484
0.463646
0.957231
0.000101
201620119349300422.xml
630652760.0
0.992892
0.013496
0.463646
0.957231
0.000526
201630129349300723.xml
341472960.0
1.000000
0.013537
0.463645
0.957231
0.000040
201610139349300811.xml
800182020.0
0.594838
0.013487
0.463645
0.957231
0.000217
201600199349300500.xml
462754433.0
0.000000
0.013479
0.463646
0.957232
0.000059
201600399349300955.xml
742744885.0
0.975219
0.013479
0.463647
0.957231
0.000034
201600369349301750.xml
471127421.0
0.138038
0.013479
0.463652
0.957239
0.000022
201610399349300226.xml
742244155.0
0.901125
0.013511
0.463645
0.957231
0.003509
201620359349300612.xml
940764760.0
0.992275
0.013525
0.463645
0.957231
0.000075
201610429349302361.xml
840588263.0
0.988925
0.013483
0.463646
0.957232
0.000033
201610369349301226.xml
20650275.0
0.681210
0.013508
0.463646
0.957231
0.000044
201640359349300129.xml
942418861.0
0.987789
0.013480
0.463656
0.957233
0.000317
201600429349301735.xml
223201871.0
0.907986
0.013479
0.463655
0.957232
0.000034
201600359349302005.xml
650599763.0
0.955581
0.013723
0.463644
0.957231
0.000022
201600419349301130.xml
470461460.0
0.465235
0.013480
0.463650
0.957232
0.000033
201640359349301614.xml
990208381.0
0.852344
0.013482
0.463648
0.957232
0.000027
201600429349302135.xml
510216586.0
0.521938
0.013522
0.463647
0.957231
0.707131
201610229349300741.xml
38657 rows × 6 columns
In [59]:
df2 = df.copy()
df2.reset_index(inplace=True, drop = True)
print(norm_df2)
Y_class_df = pd.DataFrame()
X_class_df=norm_df.loc[lst_temp]
X_class_df['Efficiency'] = 1
Y_class_df['Efficiency'] = X_class_df['Efficiency']
X_class_df.drop('Efficiency', axis=1, inplace=True)
Program_Exp Liabilities_To_Asset Working_Capital Surplus_Margin \
0 0.684058 0.013484 0.463650 0.957231
1 0.667302 0.013479 0.463657 0.957231
2 0.651333 0.013482 0.463647 0.957232
3 0.265798 0.013479 0.463740 0.957234
4 0.428952 0.013480 0.463649 0.957232
5 0.524579 0.013522 0.463645 0.957231
6 0.508964 0.013489 0.463646 0.957231
7 0.445723 0.013492 0.463690 0.957233
8 0.584548 0.013487 0.463646 0.957231
9 0.526967 0.013479 0.463672 0.957232
10 0.515236 0.013479 0.463696 0.957232
11 0.632659 0.013479 0.463649 0.957231
12 0.636148 0.013479 0.463646 0.957231
13 0.459040 0.013517 0.463645 0.957232
14 0.487059 0.013504 0.463645 0.957231
15 0.000000 0.013479 0.463646 0.957231
16 0.691234 0.013479 0.463645 0.957232
17 0.691234 0.013479 0.463645 0.957200
18 0.619964 0.013483 0.463647 0.957231
19 0.000000 0.013497 0.463646 0.957231
20 0.685154 0.013482 0.463646 0.957232
21 0.000000 0.013492 0.463647 0.957231
22 0.330200 0.013479 0.463648 0.957231
23 0.575217 0.013482 0.463654 0.957232
24 0.653299 0.013479 0.463648 0.957232
25 0.461856 0.013481 0.464095 0.957231
26 0.000000 0.013516 0.463647 0.957230
27 0.657514 0.013499 0.463646 0.957231
28 0.629263 0.013480 0.463648 0.957231
29 0.587975 0.013483 0.463649 0.957232
... ... ... ... ...
38639 0.525910 0.013514 0.463645 0.957223
38640 0.566779 0.013511 0.463646 0.957231
38641 0.090991 0.013535 0.463645 0.957231
38642 0.413883 0.013526 0.463646 0.957230
38643 0.429328 0.013481 0.463650 0.957231
38644 0.587261 0.013486 0.463647 0.957231
38645 0.613279 0.013560 0.463642 0.957231
38646 0.505208 0.013493 0.463666 0.957234
38647 0.394241 0.013481 0.463655 0.957232
38648 0.680784 0.013541 0.463645 0.957231
38649 0.691234 0.013537 0.463645 0.957231
38650 0.600616 0.013491 0.463648 0.957231
38651 0.593243 0.013484 0.463650 0.957232
38652 0.582750 0.013484 0.463646 0.957231
38653 0.686320 0.013496 0.463646 0.957231
38654 0.691234 0.013537 0.463645 0.957231
38655 0.411172 0.013487 0.463645 0.957231
38656 0.000000 0.013479 0.463646 0.957232
38657 0.674104 0.013479 0.463647 0.957231
38658 0.095417 0.013479 0.463652 0.957239
38659 0.622888 0.013511 0.463645 0.957231
38660 0.685894 0.013525 0.463645 0.957231
38661 0.683578 0.013483 0.463646 0.957232
38662 0.470875 0.013508 0.463646 0.957231
38663 0.682793 0.013480 0.463656 0.957233
38664 0.627631 0.013479 0.463655 0.957232
38665 0.660530 0.013723 0.463644 0.957231
38666 0.321586 0.013480 0.463650 0.957232
38667 0.589169 0.013482 0.463648 0.957232
38668 0.360782 0.013522 0.463647 0.957231
Total_Expenses Filename
0 0.000068 201523229349300327.xml
1 0.000004 201543089349301829.xml
2 0.000056 201533179349306298.xml
3 0.000007 201533209349304768.xml
4 0.000004 201533179349307343.xml
5 0.000089 201533189349300608.xml
6 0.000031 201523069349301367.xml
7 0.000209 201533069349300963.xml
8 0.000068 201523099349300542.xml
9 0.000015 201533099349301033.xml
10 0.000005 201523169349304367.xml
11 0.000082 201533099349301803.xml
12 0.000044 201523069349300142.xml
13 0.000020 201543039349301204.xml
14 0.000031 201523089349301462.xml
15 0.000112 201533069349300788.xml
16 0.000004 201533079349300238.xml
17 0.000168 201523209349314257.xml
18 0.000569 201523209349311332.xml
19 0.000062 201533179349302173.xml
20 0.000027 201533179349307048.xml
21 0.001072 201533209349302633.xml
22 0.000119 201533099349301113.xml
23 0.000200 201523039349300127.xml
24 0.000181 201523079349301652.xml
25 0.000162 201533039349300813.xml
26 0.002874 201533139349300208.xml
27 0.004864 201533069349301413.xml
28 0.000048 201533079349300003.xml
29 0.000098 201523069349300957.xml
... ... ...
38639 0.000107 201533099349301698.xml
38640 0.000051 201543119349300204.xml
38641 0.000333 201543139349302814.xml
38642 0.000129 201533219349301213.xml
38643 0.000100 201543179349306629.xml
38644 0.000036 201533209349316263.xml
38645 0.000021 201543079349301044.xml
38646 0.000118 201533139349300123.xml
38647 0.000085 201543179349305429.xml
38648 0.000891 201543179349308719.xml
38649 0.000005 201533179349302373.xml
38650 0.001629 201620149349300127.xml
38651 0.000936 201630129349300803.xml
38652 0.000101 201620119349300422.xml
38653 0.000526 201630129349300723.xml
38654 0.000040 201610139349300811.xml
38655 0.000217 201600199349300500.xml
38656 0.000059 201600399349300955.xml
38657 0.000034 201600369349301750.xml
38658 0.000022 201610399349300226.xml
38659 0.003509 201620359349300612.xml
38660 0.000075 201610429349302361.xml
38661 0.000033 201610369349301226.xml
38662 0.000044 201640359349300129.xml
38663 0.000317 201600429349301735.xml
38664 0.000034 201600359349302005.xml
38665 0.000022 201600419349301130.xml
38666 0.000033 201640359349301614.xml
38667 0.000027 201600429349302135.xml
38668 0.707131 201610229349300741.xml
[38669 rows x 6 columns]
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [60]:
new_df=norm_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin']]
new_df.reset_index(inplace=True,drop=True)
X_class_df=X_class_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin']]
# X_class_df=X_class_df.drop(X_class_df.index[2]) #OUTLIER REMOVER
X_class_df.reset_index(inplace=True,drop=True)
In [61]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=7)
X = svd.fit_transform(new_df)
In [62]:
db = DBSCAN(eps=0.1, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
In [79]:
print(n_clusters_)
outliers_index = []
for i in range(len(labels)):
if(labels[i]==-1):
outliers_index.append(i)
#print(df2.loc[i])
outliers = df2.loc[outliers_index]
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = 'cyan'
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = 'k'
class_member_mask = (labels == k)
xy = X[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
xy = X[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
outliers
1
Out[79]:
Filename
EIN
Program_Exp
Liabilities_To_Asset
Working_Capital
Surplus_Margin
Total_Expenses
2035
201502969349300235.xml
680383921.0
0.000000
0.000000
172324.300000
0.000000
10.0
9921
201513179349308861.xml
311040228.0
0.000000
0.540167
149559.470000
1.000333
100.0
12048
201513179349305901.xml
166050703.0
1.000000
33.472201
-148963.563600
0.000000
220.0
18753
201513139349303196.xml
943152652.0
1.000000
254.451508
-7.144765
-250550.000000
250551.0
30433
201512599349300126.xml
232799695.0
0.985825
0.426822
2.387819
-54849.571430
202757.0
Content source: rbazaz/IRS-990-Analysis
Similar notebooks: