In [1]:
import sys
sys.path.append("../script/")
import csv
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from math import floor
import ClassiferHelperAPI as CH
import importlib
importlib.reload(CH)
from sklearn import metrics
from collections import OrderedDict
import math
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             mean_absolute_error, mean_squared_error, zero_one_loss)
In [2]:
# Quick look at the ranked-image list (pd.DataFrame.from_csv is deprecated; read_csv is the current API)
pd.read_csv("../FinalResults/rankListImages_expt2.csv")
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)
    data = [row for row in reader]
In [3]:
propCount = [float(row[4]) for row in data]      # share proportion (column 5 of the rank list) for each image
distribFnc = dict(Counter(propCount))            # frequency of each distinct share proportion
# df = pd.DataFrame(cdf,index=['Counter']).transpose()
In [4]:
# plotting histograms - RELEVANT, do not delete, but incorrect; see the bucketed version in the next cell
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
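In [ ]:
# Editorial sketch (not part of the original notebook): the comment in the previous cell flags
# the histogram as incorrect, so this cross-checks the per-bin counts with np.histogram on the
# same bin edges; the next cell rebuilds the distribution by explicit bucketing.
chk_counts, chk_edges = np.histogram(propCount, bins=bins)
pd.Series(chk_counts, index=["%d-%d" % (lo, hi) for lo, hi in zip(bins[:-1], bins[1:])])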
In [47]:
bucket = {i: [] for i in range(0, 110, 10)}   # avoid dict.fromkeys(..., []), which shares one list across keys
for prop in propCount:
    key = math.ceil(prop / 10) * 10           # assign each proportion to the bucket labelled by its upper edge
    bucket[key] = bucket.get(key, []) + [prop]
bucketCount = {key: len(vals) for key, vals in bucket.items()}
df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
# Note: this overwrites the figure saved by the previous cell.
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
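In [ ]:
# Editorial sketch (not in the original): the explicit bucketing loop above is equivalent to a
# groupby on ceil(prop/10)*10; this one-liner reproduces bucketCount (empty buckets included)
# and is a quick way to verify the counts behind the plot.
ser = pd.Series(propCount)
ser.groupby(np.ceil(ser / 10) * 10).size().reindex(np.arange(0, 110, 10.0), fill_value=0)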
In [ ]:
df = pd.read_csv("../FinalResults/ImgShrRnkListWithTags.csv", index_col=0)   # DataFrame.from_csv is deprecated
cols = list(df.columns)
df.drop('URL', axis=1, inplace=True)
df.drop('Album', axis=1, inplace=True)
df.reset_index(inplace=True)
df = df.iloc[np.random.permutation(len(df))]   # shuffle the rows
df.to_csv("/tmp/test.csv", index=False)
reader = csv.reader(open("/tmp/test.csv", "r"))
head = next(reader)
data = {}
for row in reader:
    temp = {}
    for i in range(1, len(row)):
        temp[head[i]] = row[i]
    data[row[0]] = temp
allAttribs = (CH.genHead(data, 'SPECIES') + CH.genHead(data, 'SEX') + CH.genHead(data, 'AGE') +
              CH.genHead(data, 'QUALITY') + CH.genHead(data, 'VIEW_POINT') +
              CH.genHead(data, 'INDIVIDUAL_NAME') + CH.genHead(data, 'tags'))
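In [ ]:
# Editorial sketch (not in the original): the /tmp/test.csv round-trip above builds a
# {row-id: {column: value}} mapping; the same mapping can be built straight from the shuffled
# DataFrame. Caveat: csv.reader yields every field as a string, while to_dict() keeps the
# original dtypes, so this is only an illustration if the downstream helpers expect strings.
data_alt = df.set_index(df.columns[0]).to_dict(orient='index')
len(data_alt) == len(data)   # sanity check: same number of rows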
In [85]:
methods = ['logistic', 'svm', 'dtree', 'random_forests']
fl = open("/tmp/extrmClfReport.dat", "w")
clfDict = {}
performance_metric_dict = []   # one row of metrics per (classifier, split) run
for method in methods:
    for i in np.arange(0.1, 1, 0.1):   # train-test split ratios
        temp = OrderedDict()
        clf, test_x, test_y, predics, pred_prob, score = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
        accScore = accuracy_score(test_y, predics)
        f1Score = f1_score(test_y, predics)
        precision = precision_score(test_y, predics)
        recall = recall_score(test_y, predics)
        auc = roc_auc_score(test_y, pred_prob)
        abserr = mean_absolute_error(test_y, predics)      # computed but not added to the summary table
        sqerr = mean_squared_error(test_y, predics)
        zerooneloss = zero_one_loss(test_y, predics)       # computed but not added to the summary table
        temp['Classifier'] = method
        temp['Train-Test Split'] = i * 100
        temp['Accuracy'] = accScore
        temp['Precision'] = precision
        temp['Recall'] = recall
        temp['AUC'] = auc
        temp['F1 score'] = f1Score
        temp['Squared Error'] = sqerr
        performance_metric_dict.append(temp)
        clfDict[(method, i)] = clf
        fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
        fl.write(classification_report(test_y, predics))
fl.close()
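In [ ]:
# Editorial sketch (assumptions: clf, test_x, test_y still hold the last model/split from the
# loop above, the model is a scikit-learn estimator exposing predict_proba, and pred_prob is the
# predicted probability of the positive class, which is what roc_auc_score expects).
prob_pos = clf.predict_proba(test_x)[:, 1]
print("AUC recomputed from predict_proba: %.4f" % roc_auc_score(test_y, prob_pos))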
In [89]:
df = pd.DataFrame(performance_metric_dict)
df = df[['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf.csv",index=False)
df
Out[89]:
In [6]:
clf, test_x, test_y, predics, pred_prob, score = CH.buildBinClassifier(data, allAttribs, 0.4, 80, 'logistic', True)
In [88]:
for key in clfDict:
    clf = clfDict[key]
    if key[0] == 'random_forests':
        print(key, clf.estimators_)
        break
print(len(test_x.columns))
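In [ ]:
# Editorial sketch (assumption: the stored 'random_forests' model is a scikit-learn
# RandomForestClassifier fitted on the same attribute set as test_x). Its feature_importances_
# play the same role for the forest as the coefficients inspected for the logistic model below.
rf_key = next(k for k in clfDict if k[0] == 'random_forests')
pd.Series(clfDict[rf_key].feature_importances_, index=test_x.columns).sort_values(ascending=False).head(10)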
In [65]:
# Map each attribute to its learned coefficient (assumes clf is a linear model such as the logistic classifier)
d = OrderedDict.fromkeys(test_x.columns, 0)
for i in range(len(clf.coef_[0])):
    d[test_x.columns[i]] = clf.coef_[0][i]
In [69]:
pd.DataFrame(d, index=['coefficients']).transpose()
Out[69]:
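In [ ]:
# Editorial sketch (not in the original): ranking the same coefficients by absolute value makes
# the attributes that push the share prediction hardest, in either direction, easier to spot.
coef_df = pd.DataFrame(d, index=['coefficients']).transpose()
coef_df.reindex(coef_df['coefficients'].abs().sort_values(ascending=False).index).head(10)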
In [ ]: