In [1]:
import pandas as pd
# Load each exported CSV and stack them into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# frames are combined with a single pd.concat call (same row order, same
# result after the index reset below).
df1 = pd.read_csv('team_out_1.csv')
df2 = pd.read_csv('team_out_a2.csv')
df3 = pd.read_csv('team_out_a3.csv')
df4 = pd.read_csv('team_out_Yash.csv')
df5 = pd.read_csv('team_out_Yash_part1.csv')

df = (
    pd.concat([df1, df2, df3, df4, df5])
    .dropna()                  # discard rows with any missing value
    .reset_index(drop=True)    # replace the repeated per-file indices with 0..n-1
)
# Keep only rows reporting a strictly positive Total_Expenses.
df = df[df.Total_Expenses > 0]
df
Out[1]:
In [634]:
# Display the rows whose Program_Exp value exceeds 1.
df.loc[df["Program_Exp"] > 1]
Out[634]:
In [2]:
# Partition `df` into four size bands by Total_Expenses.
#
# The bands are half-open, [lower, upper), so a row sitting exactly on a
# threshold lands in exactly one band. The earlier strict comparisons on both
# sides left such rows out of every band.
#
# Every mask is computed on `df` and applied to `df`. Filtering an
# already-filtered frame with a mask built from the full frame makes pandas
# reindex the mask and emit a "Boolean Series key will be reindexed" warning.
SMALL_UPPER = 1000000
MED_UPPER = 10000000
LARGE_UPPER = 50000000

total_expenses = df.Total_Expenses
small_df = df[total_expenses < SMALL_UPPER]
med_df = df[(total_expenses >= SMALL_UPPER) & (total_expenses < MED_UPPER)]
large_df = df[(total_expenses >= MED_UPPER) & (total_expenses < LARGE_UPPER)]
national_df = df[total_expenses >= LARGE_UPPER]
In [28]:
# Screen the large band: keep rows that clear the Program_Exp and
# Liabilities_To_Asset cut-offs, then report how many passed.
# The Working_Capital > .01 and Surplus_Margin > .01 screens are switched off here.
# NOTE: the result is stored under `pos_med` even though it comes from large_df.
print(len(large_df))
large_screen = (large_df["Program_Exp"] > .75) & (large_df["Liabilities_To_Asset"] < 1)
pos_med = large_df[large_screen]
lst_temp = list(pos_med["EIN"])  # EINs of the rows that passed
n_passed = len(lst_temp)
print("%:", n_passed / len(large_df))
print("NUMBER OF POSITIVE: ", n_passed)
In [3]:
# Screen the medium band on all four indicators at once and report how many
# rows passed.
print("NUMBER OF MED: ", len(med_df))
med_screen = (
    (med_df["Program_Exp"] > .75)
    & (med_df["Working_Capital"] > .01)
    & (med_df["Liabilities_To_Asset"] < 1)
    & (med_df["Surplus_Margin"] > .01)
)
pos_med = med_df[med_screen]
lst_temp = list(pos_med["EIN"])  # EINs of the rows that passed
n_passed = len(lst_temp)
print("%:", n_passed / len(med_df))
print("NUMBER OF POSITIVE: ", n_passed)
In [36]:
# Screen the national band: keep rows that clear the Program_Exp and
# Liabilities_To_Asset cut-offs, then report the band size, the passing
# share and the passing count.
# The Working_Capital > .1 and Surplus_Margin > .1 screens are switched off here.
national_screen = (
    (national_df["Program_Exp"] > .8)
    & (national_df["Liabilities_To_Asset"] < 1)
)
eff_nat_df = national_df[national_screen]
print(len(national_df))
lst_temp = list(eff_nat_df["EIN"])  # EINs of the rows that passed
n_passed = len(lst_temp)
print("%:", n_passed / len(national_df))
print(n_passed)
In [18]:
# Screen the small band: keep rows that clear the Program_Exp and
# Liabilities_To_Asset cut-offs, then report how many passed.
# The Working_Capital > .5 and Surplus_Margin > .2 screens are switched off here.
print("NUMBER OF SMALL: ", len(small_df))
small_screen = (small_df["Program_Exp"] > .5) & (small_df["Liabilities_To_Asset"] < .5)
pos_small = small_df[small_screen]
lst_temp = list(pos_small["EIN"])  # EINs of the rows that passed
n_passed = len(lst_temp)
print("%:", n_passed / len(small_df))
print("NUMBER OF POSITIVE: ", n_passed)
In [37]:
# Choose which size band the remaining cells analyse (small_df, med_df,
# large_df or national_df).
# - This binds `df` to the same object as the chosen band (no copy), so the
#   in-place reset_index in the next cell also changes that band's frame.
# - The later cells look up the EINs in `lst_temp` inside a frame derived from
#   this `df`, so `lst_temp` must come from the screening cell of the same band.
df=national_df #CHANGE THIS TO REQUIRED SIZE AND RUN REST OF THE CODE AS IS
In [38]:
from sklearn import preprocessing

# Min-max scale the numeric indicator columns of the selected band, then
# re-attach Filename and EIN and key the scaled frame by EIN.
scaled_columns = ['Program_Exp', 'Liabilities_To_Asset', 'Working_Capital',
                  'Surplus_Margin', 'Total_Expenses']

# A fresh 0..n-1 index on `df` lets the Filename/EIN assignments below line up
# with the freshly built scaled frame.
df.reset_index(drop=True, inplace=True)

x = df[scaled_columns].values  # plain numpy array handed to the scaler
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

norm_df = pd.DataFrame(x_scaled, columns=scaled_columns)
norm_df["Filename"] = df["Filename"]
norm_df["EIN"] = df["EIN"]
norm_df = norm_df.set_index("EIN")
norm_df
Out[38]:
In [39]:
Y_class_df = pd.DataFrame()
X_class_df=norm_df.loc[lst_temp]
X_class_df['Efficiency'] = 1
Y_class_df['Efficiency'] = X_class_df['Efficiency']
X_class_df.drop('Efficiency', axis=1, inplace=True)
new_df=norm_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin']]
X_class_df=X_class_df[['Program_Exp','Liabilities_To_Asset','Working_Capital','Surplus_Margin']]
# X_class_df=X_class_df.drop(X_class_df.index[2]) #OUTLIER REMOVER
X_class_df.reset_index(inplace=True,drop=True)
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm
from scipy import stats
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2, n_iter=7)
reduced_df = svd.fit_transform(new_df)
#svd2 = TruncatedSVD(n_components=2, n_iter=7)
X_classtrain_df = svd.fit_transform(X_class_df)
'''outliers_fraction = 0.25'''
#colors = ['m', 'g', 'b']
clf = svm.OneClassSVM(nu=0.1, kernel="linear", gamma=.01,coef0=1.5)
clf.fit(X_classtrain_df)
#y_pred_test = clf.predict(normalized_df)
#print(y_pred_test)
Z1 = clf.predict(reduced_df)
res_matrix=Z1
print(res_matrix)
#print(reduced_df.shape)
if Z1.shape[0]%2==1:
Z1=Z1[:-1]
temp_Z1=Z1
Z1 = Z1.reshape((-1,2))
print(Z1.shape)
xx1 = []
yy1= []
for i in reduced_df:
xx1.append(i[0])
yy1.append(i[1])
x1 = np.asarray(xx1)
y1 = np.asarray(yy1)
temp_y1=y1
temp_x1=x1
if len(x1)%2==1: #IS ODD:
x1=x1[:-1]
if len(y1)%2==1:
y1=y1[:-1]
x1 = x1.reshape((-1,2))
y1 = y1.reshape((-1,2))
print(x1.shape,y1.shape)
# plt.figure(0)
# plt.contourf(reduced_df[0:Z1.shape[0]], reduced_df[Z1.shape[0]:len(res_matrix)-1], Z1,cmap=plt.cm.coolwarm)
# plt.figure(1)
# plt.contourf(x1, y1, Z1)
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.contourf(x1,y1,Z1)
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.contour(reduced_df[0:Z1.shape[0]], reduced_df[Z1.shape[0]:len(res_matrix)-1], Z1,cmap=plt.cm.coolwarm)
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.plot_trisurf(temp_x1,temp_y1, temp_Z1,cmap=plt.cm.coolwarm)
# # #plt.scatter(X_class_df.as_matrix()[:, 0], X_class_df.as_matrix()[:, 1], color='black')
In [40]:
# Collect the rows of `df` that the one-class SVM labelled -1 (outlier).
# `res_matrix` has one label per row and `df` carries a 0..n-1 index, so a
# positional boolean mask selects the same rows as the former per-row loop.
# Unlike the loop, the mask keeps the original column dtypes (transposing
# single rows degraded them to object). It also yields an empty frame instead
# of raising from pd.concat when nothing was flagged.
new_df = norm_df.reset_index()

is_flagged = res_matrix == -1
count = int(is_flagged.sum())
print(count)
print(count/len(norm_df))

ineff_nat_df = df[is_flagged].reset_index(drop=True)
ineff_nat_df
Out[40]: