In [2]:
import csv
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from math import floor
import pandas as pd
import ClassiferHelperAPI as CH
import importlib
importlib.reload(CH)
from sklearn import metrics
from collections import OrderedDict
import math
from sklearn.metrics import classification_report,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,mean_absolute_error, mean_squared_error,zero_one_loss

In [2]:
# Load the per-image share-rank list.
# FIX: pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv(..., index_col=0) is the documented replacement.
rank_df = pd.read_csv("../FinalResults/rankListImages_expt2.csv", index_col=0)
rank_df

# Re-read the same file with the csv module to get raw string rows:
# `head` is the header row, `data` the remaining rows as lists of strings.
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)              # next(reader) is the idiomatic spelling of reader.__next__()
    data = [row for row in reader]

In [3]:
# Column index 4 of each raw row holds the image's share proportion
# (presumably a percentage in [0, 100] — confirm against the CSV header).
propCount = [float(r[4]) for r in data]

# Frequency of each distinct proportion value.
distribFnc = {value: freq for value, freq in Counter(propCount).items()}
# df = pd.DataFrame(cdf,index=['Counter']).transpose()

In [4]:
# Histogram of share proportions in 10-point bins.
# (Original note said "RELEVANT, do not delete, but incorrect" — likely because
# plt.hist places a value of exactly 10 in the 10-20 bin, while the bucketing
# cell below uses ceil() and puts 10 in the 0-10 bucket; confirm intent.)
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion(X-axis) versus Number of Images shared x% of times(Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
# FIX: the `b` kwarg of plt.grid was renamed `visible` in matplotlib 3.5 and
# later removed; `visible=True` works on all supported versions.
plt.grid(visible=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')

In [47]:
# Bucket each share proportion by the ceiling multiple of 10 (e.g. 23.5 -> 30,
# exactly 20 -> 20) and plot the per-bucket image counts.
# FIX: the original used dict.fromkeys(keys, []), which binds every key to the
# SAME list object. It only worked because values were rebound, never mutated —
# a latent bug. Build an independent list per key instead.
bucket = {edge: [] for edge in range(0, 110, 10)}

for prop in propCount:
    key = math.ceil(prop / 10) * 10
    bucket[key].append(prop)

# Number of images falling in each bucket.
bucketCount = {key: len(vals) for key, vals in bucket.items()}

df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion(X-axis) versus Number of Images shared x% of times(Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
# FIX: `b=` renamed to `visible=` in matplotlib 3.5 (removed later).
plt.grid(visible=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')

In [71]:
# Load the tagged share-rank list, drop non-feature columns, shuffle the rows,
# then round-trip through CSV so every value comes back as a raw string.
# FIX: pd.DataFrame.from_csv was removed in pandas 1.0 (use read_csv), and the
# positional-axis form df.drop('URL', 1) was removed in pandas 2.0.
df = pd.read_csv("../FinalResults/ImgShrRnkListWithTags.csv", index_col=0)
cols = list(df.columns)
df = df.drop(columns=['URL', 'Album'])
df.reset_index(inplace=True)
df = df.iloc[np.random.permutation(len(df))]   # NOTE(review): unseeded shuffle — not reproducible across runs
df.to_csv("/tmp/test.csv", index=False)

# FIX: open the temp file with a context manager instead of leaking the handle.
# `data` maps each image id (first column) to {column_name: raw string value}.
with open("/tmp/test.csv", "r") as tmpFl:
    reader = csv.reader(tmpFl)
    head = next(reader)
    data = {row[0]: dict(zip(head[1:], row[1:])) for row in reader}

# Flatten every categorical attribute group into one feature-name list,
# preserving the original group order.
attrib_groups = ['SPECIES', 'SEX', 'AGE', 'QUALITY',
                 'VIEW_POINT', 'INDIVIDUAL_NAME', 'tags']
allAttribs = [attrib for grp in attrib_groups for attrib in CH.genHead(data, grp)]

In [ ]:
# Train each classifier type at split fractions 0.1..0.9 and collect a row of
# evaluation metrics per (method, split); also keep every fitted model and dump
# a per-run classification report to a text file.
methods = ['logistic', 'svm', 'dtree', 'random_forests']
clfDict = {}                   # (method, split fraction) -> fitted classifier
performance_metric_dict = []   # NOTE(review): despite the name, a LIST of per-run OrderedDicts

# FIX: open the report file with a context manager instead of manual close().
with open("/tmp/extrmClfReport.dat", "w") as fl:
    for method in methods:
        for i in np.arange(0.1, 1, 0.1):
            clf, test_x, test_y, predics, pred_prob, score = \
                CH.buildBinClassifier(data, allAttribs, i, 80, method, True)

            temp = OrderedDict()
            temp['Classifier'] = method
            temp['Train-Test Split'] = i * 100
            temp['Accuracy'] = accuracy_score(test_y, predics)
            temp['Precision'] = precision_score(test_y, predics)
            temp['Recall'] = recall_score(test_y, predics)
            temp['AUC'] = roc_auc_score(test_y, pred_prob)
            temp['F1 score'] = f1_score(test_y, predics)
            # FIX: mean_absolute_error and zero_one_loss were computed but never
            # stored, although the rendered results table has both columns.
            temp['Absolute Error'] = mean_absolute_error(test_y, predics)
            temp['Squared Error'] = mean_squared_error(test_y, predics)
            temp['0/1 loss'] = zero_one_loss(test_y, predics)
            performance_metric_dict.append(temp)

            clfDict[(method, i)] = clf
            fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
            fl.write(classification_report(test_y, predics))


/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [74]:
# Assemble the per-run metric rows into a table with a fixed column order,
# persist it as CSV, and display it.
metric_cols = ['Classifier', 'Train-Test Split', 'Accuracy', 'Precision',
               'Recall', 'F1 score', 'AUC', 'Squared Error']

df = pd.DataFrame(performance_metric_dict)[metric_cols]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf.csv", index=False)
df


Out[74]:
Classifier Train-Test Split Accuracy Precision Recall F1 score AUC Absolute Error Squared Error 0/1 loss
0 logistic 10 0.604651 0.416667 0.333333 0.370370 0.626190 0.395349 0.395349 0.395349
1 logistic 20 0.611765 0.478261 0.343750 0.400000 0.657134 0.388235 0.388235 0.388235
2 logistic 30 0.645669 0.500000 0.422222 0.457831 0.697154 0.354331 0.354331 0.354331
3 logistic 40 0.674556 0.547619 0.389831 0.455446 0.723421 0.325444 0.325444 0.325444
4 logistic 50 0.649289 0.472222 0.485714 0.478873 0.690578 0.350711 0.350711 0.350711
5 logistic 60 0.665354 0.521127 0.420455 0.465409 0.694722 0.334646 0.334646 0.334646
6 logistic 70 0.668919 0.513889 0.370000 0.430233 0.681122 0.331081 0.331081 0.331081
7 logistic 80 0.639053 0.444444 0.318584 0.371134 0.626155 0.360947 0.360947 0.360947
8 logistic 90 0.665789 0.520548 0.292308 0.374384 0.598508 0.334211 0.334211 0.334211
9 svm 10 0.651163 0.000000 0.000000 0.000000 0.600000 0.348837 0.348837 0.348837
10 svm 20 0.623529 0.000000 0.000000 0.000000 0.632960 0.376471 0.376471 0.376471
11 svm 30 0.645669 0.000000 0.000000 0.000000 0.666802 0.354331 0.354331 0.354331
12 svm 40 0.650888 0.000000 0.000000 0.000000 0.679045 0.349112 0.349112 0.349112
13 svm 50 0.668246 0.000000 0.000000 0.000000 0.652685 0.331754 0.331754 0.331754
14 svm 60 0.653543 0.000000 0.000000 0.000000 0.674185 0.346457 0.346457 0.346457
15 svm 70 0.662162 0.000000 0.000000 0.000000 0.644031 0.337838 0.337838 0.337838
16 svm 80 0.665680 0.000000 0.000000 0.000000 0.604503 0.334320 0.334320 0.334320
17 svm 90 0.657895 0.000000 0.000000 0.000000 0.614385 0.342105 0.342105 0.342105
18 dtree 10 0.511628 0.250000 0.200000 0.222222 0.439286 0.488372 0.488372 0.488372
19 dtree 20 0.576471 0.433333 0.406250 0.419355 0.537146 0.423529 0.423529 0.423529
20 dtree 30 0.629921 0.472222 0.377778 0.419753 0.573035 0.370079 0.370079 0.370079
21 dtree 40 0.609467 0.414634 0.288136 0.340000 0.534977 0.390533 0.390533 0.390533
22 dtree 50 0.630332 0.451220 0.528571 0.486842 0.607903 0.369668 0.369668 0.369668
23 dtree 60 0.633858 0.469136 0.431818 0.449704 0.586391 0.366142 0.366142 0.366142
24 dtree 70 0.652027 0.477612 0.320000 0.383234 0.570714 0.347973 0.347973 0.347973
25 dtree 80 0.615385 0.408602 0.336283 0.368932 0.545919 0.384615 0.384615 0.384615
26 dtree 90 0.650000 0.470588 0.184615 0.265193 0.538308 0.350000 0.350000 0.350000
27 random_forests 10 0.604651 0.375000 0.200000 0.260870 0.669048 0.395349 0.395349 0.395349
28 random_forests 20 0.588235 0.434783 0.312500 0.363636 0.665979 0.411765 0.411765 0.411765
29 random_forests 30 0.614173 0.416667 0.222222 0.289855 0.622087 0.385827 0.385827 0.385827
30 random_forests 40 0.662722 0.533333 0.271186 0.359551 0.715794 0.337278 0.337278 0.337278
31 random_forests 50 0.639810 0.451613 0.400000 0.424242 0.695137 0.360190 0.360190 0.360190
32 random_forests 60 0.645669 0.479167 0.261364 0.338235 0.591765 0.354331 0.354331 0.354331
33 random_forests 70 0.652027 0.466667 0.210000 0.289655 0.636403 0.347973 0.347973 0.347973
34 random_forests 80 0.615385 0.345455 0.168142 0.226190 0.538328 0.384615 0.384615 0.384615
35 random_forests 90 0.673684 0.560000 0.215385 0.311111 0.613585 0.326316 0.326316 0.326316

In [6]:
clf,test_x, test_y, predics, pred_prob,score = CH.buildBinClassifier(data,allAttribs,0.4,80,'logistic',True)

In [84]:
# Print the estimator count of the first random-forests model stored in
# clfDict, then the number of feature columns in test_x for comparison.
# NOTE(review): this loop rebinds the global `clf` on every iteration, and the
# execution counts (In[65] ran before In[84]) show the coefficient cells below
# depend on out-of-order state — a top-to-bottom re-run would leave `clf`
# pointing at a random forest, which has no .coef_. Left byte-identical.
for key in clfDict:
    clf = clfDict[key]
    
    if key[0] == 'random_forests':
        print(key, len(clf.estimators_))
        break
        
print(len(test_x.columns))


('random_forests', 0.20000000000000001) 835
835

In [65]:
# Map each feature column name to its fitted coefficient from the (binary)
# logistic model. Prefill every column with 0 so the key set always matches
# test_x.columns, then overwrite with the actual coefficients.
d = OrderedDict.fromkeys(test_x.columns, 0)

for column, coefficient in zip(test_x.columns, clf.coef_[0]):
    d[column] = coefficient

In [69]:
pd.DataFrame(d,index=['co-efficients']).transpose()


Out[69]:
co-efficients
zebra_plains -0.468173
giraffe_masai 0.516168
UNKNOWN NAME -0.043396
Female 0.498140
UNKNOWN SEX -0.409351
Male -0.210480
unknown -0.406869
juveniles - one year old -0.351691
juveniles- two year old -0.268446
adult 0.034764
infant 0.125538
excellent 0.818747
poor 0.228112
good 0.917564
ok 0.414937
junk -0.162535
backright 0.632304
front 0.112286
frontleft 0.594611
right 0.400426
left -0.441930
back -0.418995
frontright 0.184198
backleft -0.916222
IBEIS_PZ_1221 0.000000
IBEIS_PZ_0886 0.391212
IBEIS_PZ_1547 0.000000
IBEIS_PZ_0956 0.000000
IBEIS_PZ_1845 0.000000
NNP_GIRM_0116 -0.196775
... ...
sheep -0.293586
arthropod 0.000000
elephant -0.147098
lone -0.099475
group -1.042633
mountain 0.001608
laying 0.000000
animal 0.345434
dry -0.333251
grazing -0.246223
cactus -0.093977
hill 0.237482
brown 0.159295
sky -0.304497
mammal 0.324114
road 0.920785
wild -0.579443
tall -0.742962
giraffe 0.224554
brush 0.000000
eating -0.366870
field 0.081002
dirt 0.170823
antelope -0.056490
running 0.000000
bushes -0.259766
mother 0.000000
path -0.020852
standing 0.638315
outdoor -0.762959

832 rows × 1 columns


In [ ]: