In [2]:
import csv
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from math import floor
import pandas as pd
import ClassiferHelperAPI as CH
import importlib
importlib.reload(CH)
from sklearn import metrics
from collections import OrderedDict
import math
from sklearn.metrics import classification_report,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,mean_absolute_error, mean_squared_error,zero_one_loss

In [2]:
# Load the per-image share-rank list.
# FIX: pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv(..., index_col=0) is the documented replacement.
rank_df = pd.read_csv("../FinalResults/rankListImages_expt2.csv", index_col=0)
rank_df

# Re-read the same file with the csv module to get raw string rows:
# `head` is the header row, `data` the remaining rows as lists of strings.
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)              # next(reader) is the idiomatic spelling of reader.__next__()
    data = [row for row in reader]

In [3]:
# Column index 4 of each raw row holds the image's share proportion
# (presumably a percentage in [0, 100] — confirm against the CSV header).
propCount = [float(r[4]) for r in data]

# Frequency of each distinct proportion value.
distribFnc = {value: freq for value, freq in Counter(propCount).items()}
# df = pd.DataFrame(cdf,index=['Counter']).transpose()

In [4]:
# Histogram of share proportions in 10-point bins.
# (Original note said "RELEVANT, do not delete, but incorrect" — likely because
# plt.hist places a value of exactly 10 in the 10-20 bin, while the bucketing
# cell below uses ceil() and puts 10 in the 0-10 bucket; confirm intent.)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion(X-axis) versus Number of Images shared x% of times(Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
# FIX: the `b` kwarg of plt.grid was renamed `visible` in matplotlib 3.5 and
# later removed; `visible=True` works on all supported versions.
plt.grid(visible=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')

In [47]:
# Bucket each share proportion by the ceiling multiple of 10 (e.g. 23.5 -> 30,
# exactly 20 -> 20) and plot the per-bucket image counts.
# FIX: the original used dict.fromkeys(keys, []), which binds every key to the
# SAME list object. It only worked because values were rebound, never mutated —
# a latent bug. Build an independent list per key instead.
bucket = {edge: [] for edge in range(0, 110, 10)}

for prop in propCount:
    key = math.ceil(prop / 10) * 10
    bucket[key].append(prop)

# Number of images falling in each bucket.
bucketCount = {key: len(vals) for key, vals in bucket.items()}

df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion(X-axis) versus Number of Images shared x% of times(Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
# FIX: `b=` renamed to `visible=` in matplotlib 3.5 (removed later).
plt.grid(visible=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')

In [71]:
# Load the tagged share-rank list, drop non-feature columns, shuffle the rows,
# then round-trip through CSV so every value comes back as a raw string.
# FIX: pd.DataFrame.from_csv was removed in pandas 1.0 (use read_csv), and the
# positional-axis form df.drop('URL', 1) was removed in pandas 2.0.
df = pd.read_csv("../FinalResults/ImgShrRnkListWithTags.csv", index_col=0)
cols = list(df.columns)
df = df.drop(columns=['URL', 'Album'])
df.reset_index(inplace=True)
df = df.iloc[np.random.permutation(len(df))]   # NOTE(review): unseeded shuffle — not reproducible across runs
df.to_csv("/tmp/test.csv", index=False)

# FIX: open the temp file with a context manager instead of leaking the handle.
# `data` maps each image id (first column) to {column_name: raw string value}.
with open("/tmp/test.csv", "r") as tmpFl:
    reader = csv.reader(tmpFl)
    head = next(reader)
    data = {row[0]: dict(zip(head[1:], row[1:])) for row in reader}

# Flatten every categorical attribute group into one feature-name list,
# preserving the original group order.
attrib_groups = ['SPECIES', 'SEX', 'AGE', 'QUALITY',
                 'VIEW_POINT', 'INDIVIDUAL_NAME', 'tags']
allAttribs = [attrib for grp in attrib_groups for attrib in CH.genHead(data, grp)]

In [ ]:
# Train each classifier type at split fractions 0.1..0.9 and collect a row of
# evaluation metrics per (method, split); also keep every fitted model and dump
# a per-run classification report to a text file.
methods = ['logistic', 'svm', 'dtree', 'random_forests']
clfDict = {}                   # (method, split fraction) -> fitted classifier
performance_metric_dict = []   # NOTE(review): despite the name, a LIST of per-run OrderedDicts

# FIX: open the report file with a context manager instead of manual close().
with open("/tmp/extrmClfReport.dat", "w") as fl:
    for method in methods:
        for i in np.arange(0.1, 1, 0.1):
            clf, test_x, test_y, predics, pred_prob, score = \
                CH.buildBinClassifier(data, allAttribs, i, 80, method, True)

            temp = OrderedDict()
            temp['Classifier'] = method
            temp['Train-Test Split'] = i * 100
            temp['Accuracy'] = accuracy_score(test_y, predics)
            temp['Precision'] = precision_score(test_y, predics)
            temp['Recall'] = recall_score(test_y, predics)
            temp['AUC'] = roc_auc_score(test_y, pred_prob)
            temp['F1 score'] = f1_score(test_y, predics)
            # FIX: mean_absolute_error and zero_one_loss were computed but never
            # stored, although the rendered results table has both columns.
            temp['Absolute Error'] = mean_absolute_error(test_y, predics)
            temp['Squared Error'] = mean_squared_error(test_y, predics)
            temp['0/1 loss'] = zero_one_loss(test_y, predics)
            performance_metric_dict.append(temp)

            clfDict[(method, i)] = clf
            fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
            fl.write(classification_report(test_y, predics))


/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [74]:
# Assemble the per-run metric rows into a table with a fixed column order,
# persist it as CSV, and display it.
metric_cols = ['Classifier', 'Train-Test Split', 'Accuracy', 'Precision',
               'Recall', 'F1 score', 'AUC', 'Squared Error']

df = pd.DataFrame(performance_metric_dict)[metric_cols]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf.csv", index=False)
df


Out[74]:
Classifier Train-Test Split Accuracy Precision Recall F1 score AUC Absolute Error Squared Error 0/1 loss
0 logistic 10 0.604651 0.416667 0.333333 0.370370 0.626190 0.395349 0.395349 0.395349
1 logistic 20 0.611765 0.478261 0.343750 0.400000 0.657134 0.388235 0.388235 0.388235
2 logistic 30 0.645669 0.500000 0.422222 0.457831 0.697154 0.354331 0.354331 0.354331
3 logistic 40 0.674556 0.547619 0.389831 0.455446 0.723421 0.325444 0.325444 0.325444
4 logistic 50 0.649289 0.472222 0.485714 0.478873 0.690578 0.350711 0.350711 0.350711
5 logistic 60 0.665354 0.521127 0.420455 0.465409 0.694722 0.334646 0.334646 0.334646
6 logistic 70 0.668919 0.513889 0.370000 0.430233 0.681122 0.331081 0.331081 0.331081
7 logistic 80 0.639053 0.444444 0.318584 0.371134 0.626155 0.360947 0.360947 0.360947
8 logistic 90 0.665789 0.520548 0.292308 0.374384 0.598508 0.334211 0.334211 0.334211
9 svm 10 0.651163 0.000000 0.000000 0.000000 0.600000 0.348837 0.348837 0.348837
10 svm 20 0.623529 0.000000 0.000000 0.000000 0.632960 0.376471 0.376471 0.376471
11 svm 30 0.645669 0.000000 0.000000 0.000000 0.666802 0.354331 0.354331 0.354331
12 svm 40 0.650888 0.000000 0.000000 0.000000 0.679045 0.349112 0.349112 0.349112
13 svm 50 0.668246 0.000000 0.000000 0.000000 0.652685 0.331754 0.331754 0.331754
14 svm 60 0.653543 0.000000 0.000000 0.000000 0.674185 0.346457 0.346457 0.346457
15 svm 70 0.662162 0.000000 0.000000 0.000000 0.644031 0.337838 0.337838 0.337838
16 svm 80 0.665680 0.000000 0.000000 0.000000 0.604503 0.334320 0.334320 0.334320
17 svm 90 0.657895 0.000000 0.000000 0.000000 0.614385 0.342105 0.342105 0.342105
18 dtree 10 0.511628 0.250000 0.200000 0.222222 0.439286 0.488372 0.488372 0.488372
19 dtree 20 0.576471 0.433333 0.406250 0.419355 0.537146 0.423529 0.423529 0.423529
20 dtree 30 0.629921 0.472222 0.377778 0.419753 0.573035 0.370079 0.370079 0.370079
21 dtree 40 0.609467 0.414634 0.288136 0.340000 0.534977 0.390533 0.390533 0.390533
22 dtree 50 0.630332 0.451220 0.528571 0.486842 0.607903 0.369668 0.369668 0.369668
23 dtree 60 0.633858 0.469136 0.431818 0.449704 0.586391 0.366142 0.366142 0.366142
24 dtree 70 0.652027 0.477612 0.320000 0.383234 0.570714 0.347973 0.347973 0.347973
25 dtree 80 0.615385 0.408602 0.336283 0.368932 0.545919 0.384615 0.384615 0.384615
26 dtree 90 0.650000 0.470588 0.184615 0.265193 0.538308 0.350000 0.350000 0.350000
27 random_forests 10 0.604651 0.375000 0.200000 0.260870 0.669048 0.395349 0.395349 0.395349
28 random_forests 20 0.588235 0.434783 0.312500 0.363636 0.665979 0.411765 0.411765 0.411765
29 random_forests 30 0.614173 0.416667 0.222222 0.289855 0.622087 0.385827 0.385827 0.385827
30 random_forests 40 0.662722 0.533333 0.271186 0.359551 0.715794 0.337278 0.337278 0.337278
31 random_forests 50 0.639810 0.451613 0.400000 0.424242 0.695137 0.360190 0.360190 0.360190
32 random_forests 60 0.645669 0.479167 0.261364 0.338235 0.591765 0.354331 0.354331 0.354331
33 random_forests 70 0.652027 0.466667 0.210000 0.289655 0.636403 0.347973 0.347973 0.347973
34 random_forests 80 0.615385 0.345455 0.168142 0.226190 0.538328 0.384615 0.384615 0.384615
35 random_forests 90 0.673684 0.560000 0.215385 0.311111 0.613585 0.326316 0.326316 0.326316

In [6]:
clf,test_x, test_y, predics, pred_prob,score = CH.buildBinClassifier(data,allAttribs,0.4,80,'logistic',True)

In [84]:
# Print the estimator count of the first random-forests model stored in
# clfDict, then the number of feature columns in test_x for comparison.
# NOTE(review): this loop rebinds the global `clf` on every iteration, and the
# execution counts (In[65] ran before In[84]) show the coefficient cells below
# depend on out-of-order state — a top-to-bottom re-run would leave `clf`
# pointing at a random forest, which has no .coef_. Left byte-identical.
for key in clfDict:
    clf = clfDict[key]
    
    if key[0] == 'random_forests':
        print(key, len(clf.estimators_))
        break
        
print(len(test_x.columns))


('random_forests', 0.20000000000000001) 835
835

In [65]:
# Map each feature column name to its fitted coefficient from the (binary)
# logistic model. Prefill every column with 0 so the key set always matches
# test_x.columns, then overwrite with the actual coefficients.
d = OrderedDict.fromkeys(test_x.columns, 0)

for column, coefficient in zip(test_x.columns, clf.coef_[0]):
    d[column] = coefficient

In [69]:
pd.DataFrame(d,index=['co-efficients']).transpose()


Out[69]:
co-efficients
zebra_plains -0.468173
giraffe_masai 0.516168
UNKNOWN NAME -0.043396
Female 0.498140
UNKNOWN SEX -0.409351
Male -0.210480
unknown -0.406869
juveniles - one year old -0.351691
juveniles- two year old -0.268446
adult 0.034764
infant 0.125538
excellent 0.818747
poor 0.228112
good 0.917564
ok 0.414937
junk -0.162535
backright 0.632304
front 0.112286
frontleft 0.594611
right 0.400426
left -0.441930
back -0.418995
frontright 0.184198
backleft -0.916222
IBEIS_PZ_1221 0.000000
IBEIS_PZ_0886 0.391212
IBEIS_PZ_1547 0.000000
IBEIS_PZ_0956 0.000000
IBEIS_PZ_1845 0.000000
NNP_GIRM_0116 -0.196775
... ...
sheep -0.293586
arthropod 0.000000
elephant -0.147098
lone -0.099475
group -1.042633
mountain 0.001608
laying 0.000000
animal 0.345434
dry -0.333251
grazing -0.246223
cactus -0.093977
hill 0.237482
brown 0.159295
sky -0.304497
mammal 0.324114
road 0.920785
wild -0.579443
tall -0.742962
giraffe 0.224554
brush 0.000000
eating -0.366870
field 0.081002
dirt 0.170823
antelope -0.056490
running 0.000000
bushes -0.259766
mother 0.000000
path -0.020852
standing 0.638315
outdoor -0.762959

832 rows × 1 columns


In [ ]: