In [1]:
import sys
sys.path.append("../script/")

import csv
import math
import importlib
from collections import Counter, OrderedDict
from math import floor

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import ClassiferHelperAPI as CH
importlib.reload(CH)

from sklearn import metrics
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             mean_absolute_error, mean_squared_error, zero_one_loss)
In [2]:
# Preview the ranked-image results (pd.DataFrame.from_csv is deprecated; use pd.read_csv)
pd.read_csv("../FinalResults/rankListImages_expt2.csv")

# Re-read the file with the csv module, keeping the header and data rows separately
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)
    data = [row for row in reader]
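A quick check of the column layout makes the positional index used in the next cell easier to follow; a minimal sketch, assuming head and data from the cell above:

# Inspect which column index holds which field (e.g. the share-proportion column indexed below)
list(enumerate(head))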
In [3]:
# Column 4 of each row holds the image's share proportion
propCount = [float(row[4]) for row in data]
distribFnc = dict(Counter(propCount))   # proportion value -> number of images
# df = pd.DataFrame(distribFnc, index=['Counter']).transpose()
In [4]:
# Histogram of share proportions - RELEVANT, do not delete, but incorrect (see the bucketed version in the next cell)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
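The bar heights drawn by plt.hist can be cross-checked numerically with np.histogram; a minimal sketch, assuming propCount and bins from the cell above:

# Cross-check of the histogram counts (same bins as above).
counts_chk, edges = np.histogram(propCount, bins=bins)
for lo, hi, n in zip(edges[:-1], edges[1:], counts_chk):
    # np.histogram uses half-open intervals [lo, hi), except the last bin, which is closed.
    print("[%3d, %3d): %d images" % (lo, hi, n))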
In [47]:
# Bucket each proportion into 10-point bins: proportion p goes to bucket ceil(p/10)*10.
# Note: dict.fromkeys with a mutable default shares one list across all keys; it is
# harmless here only because bucket[key] is re-bound (not mutated in place).
bucket = dict.fromkeys(range(0, 110, 10), [])
for prop in propCount:
    key = math.ceil(prop / 10) * 10
    bucket[key] = bucket.get(key, []) + [prop]
bucketCount = {}
for key in bucket:
    bucketCount[key] = len(bucket[key])
df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
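The same bucket counts can be produced in one pass with collections.Counter; a minimal sketch, assuming propCount from above (empty buckets are filled in so the x-axis stays complete):

# One-pass alternative to the loop above; bucketCount2 should agree with bucketCount.
bucketCount2 = Counter(math.ceil(p / 10) * 10 for p in propCount)
bucketCount2 = {k: bucketCount2.get(k, 0) for k in range(0, 110, 10)}  # include empty buckets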
In [37]:
methods = ['logistic', 'svm', 'dtree', 'random_forests']
fl = open("/tmp/extrmClfReport.dat", "w")
clfDict = {}                      # (method, split) -> fitted classifier
performance_metric_dict = []
for method in methods:
    for i in np.arange(0.1, 1, 0.1):
        temp = OrderedDict()
        clf, test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
        accScore = accuracy_score(test_y, predics)
        f1Score = f1_score(test_y, predics)
        precision = precision_score(test_y, predics)
        recall = recall_score(test_y, predics)
        auc = roc_auc_score(test_y, pred_prob)
        abserr = mean_absolute_error(test_y, predics)
        sqerr = mean_squared_error(test_y, predics)
        zerooneloss = zero_one_loss(test_y, predics)
        temp['Classifier'] = method
        temp['Train-Test Split'] = i * 100
        temp['Accuracy'] = accScore
        temp['Precision'] = precision
        temp['Recall'] = recall
        temp['AUC'] = auc
        temp['F1 score'] = f1Score
        temp['Squared Error'] = sqerr
        performance_metric_dict.append(temp)
        clfDict[(method, i)] = clf
        fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
        fl.write(classification_report(test_y, predics))
fl.close()
In [38]:
df = pd.DataFrame(performance_metric_dict)
df = df[['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf1.csv",index=False)
df
Out[38]:
In [6]:
clf,test_x, test_y, predics, pred_prob,score = CH.buildBinClassifier(data,allAttribs,0.4,80,'logistic',True)
In [88]:
for key in clfDict:
    clf = clfDict[key]
    if key[0] == 'random_forests':
        print(key, clf.estimators_)
        break
print(len(test_x.columns))
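For the random-forest model, the analogue of the logistic coefficients inspected below is feature_importances_; a minimal sketch, assuming clf is the fitted random forest from this cell and test_x its feature matrix:

# Rank attributes by random-forest importance (assumes a fitted sklearn RandomForestClassifier).
importances = pd.Series(clf.feature_importances_, index=test_x.columns)
importances.sort_values(ascending=False).head(20)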
In [65]:
# Map each attribute to its logistic-regression coefficient
d = OrderedDict.fromkeys(test_x.columns, 0)
for i in range(len(clf.coef_[0])):
    d[test_x.columns[i]] = clf.coef_[0][i]
In [69]:
pd.DataFrame(d,index=['co-efficients']).transpose()
Out[69]:
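To see which attributes drive the prediction most strongly, the coefficients can be ranked by absolute value; a minimal sketch, assuming the dictionary d built above:

# Rank attributes by the magnitude of their logistic-regression coefficient.
coefs = pd.Series(d)
coefs.reindex(coefs.abs().sort_values(ascending=False).index).head(20)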
In [34]:
d = CH.createDataFlDict(data,allAttribs,True,80,True)
In [35]:
# Number of positive/negative examples:
posEgs = 0
negEgs = 0
for key in d.keys():
    if d[key]['TARGET'] == 1:
        posEgs += 1
    else:
        negEgs += 1
In [36]:
posEgs,negEgs
Out[36]:
In [19]:
# Images shared at least 80% of the time vs. at most 20% of the time
print(len(list(filter(lambda x: float(data[x]['Proportion']) >= 80.0, data.keys()))))
print(len(list(filter(lambda x: float(data[x]['Proportion']) <= 20.0, data.keys()))))
In [20]:
230 + 192
Out[20]:
In [21]:
# Keep only the extreme images: shared >= 80% or <= 20% of the time
fDataKeys = list(filter(lambda x: float(data[x]['Proportion']) >= 80.0 or float(data[x]['Proportion']) <= 20.0, data.keys()))
fData = {gid: data[gid] for gid in fDataKeys}
In [30]:
pd.DataFrame(fData).transpose()['Proportion'].to_csv("/tmp/props.csv")
In [32]:
pd.DataFrame(d).transpose()['TARGET'].to_csv("/tmp/targs.csv")
In [50]:
# Same sweep as In [37], but the metric computation is delegated to CH.evalClassifierPerf
methods = ['logistic', 'svm', 'dtree', 'random_forests']
fl = open("/tmp/extrmClfReport.dat", "w")
clfDict = {}                      # (method, split) -> fitted classifier
performance_metric_dict = []
for method in methods:
    for i in np.arange(0.1, 1, 0.1):
        clf, test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
        temp = CH.evalClassifierPerf(clf, test_x, test_y, predics, pred_prob)
        temp['Method'] = method
        temp['Train-Test Split'] = i * 100
        performance_metric_dict.append(temp)
        clfDict[(method, i)] = clf
        fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
        fl.write(classification_report(test_y, predics))
fl.close()
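The implementation of CH.evalClassifierPerf lives in ClassiferHelperAPI and is not shown here; judging from the explicit metric calls in In [37], it presumably bundles the same sklearn metrics into an ordered dict. A hypothetical sketch of such a helper (the name, signature, and return shape are assumptions, not the module's confirmed API):

# Hypothetical sketch mirroring the metric calls made in In [37]; the real CH.evalClassifierPerf may differ.
def eval_classifier_perf(test_y, predics, pred_prob):
    perf = OrderedDict()
    perf['Accuracy'] = accuracy_score(test_y, predics)
    perf['Precision'] = precision_score(test_y, predics)
    perf['Recall'] = recall_score(test_y, predics)
    perf['AUC'] = roc_auc_score(test_y, pred_prob)
    perf['F1 score'] = f1_score(test_y, predics)
    perf['Squared Error'] = mean_squared_error(test_y, predics)
    perf['Zero-One Loss'] = zero_one_loss(test_y, predics)
    return perf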
In [2]:
import mpl_toolkits as mt
In [5]:
from mpl_toolkits.basemap import Basemap
In [ ]: