In [1]:
import sys
sys.path.append("../script/")
import csv
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from math import floor
import ClassiferHelperAPI as CH
import importlib
importlib.reload(CH)
from sklearn import metrics
from collections import OrderedDict
import math
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             mean_absolute_error, mean_squared_error, zero_one_loss)
In [2]:
# Quick look at the ranked-image list (pd.DataFrame.from_csv is deprecated; read_csv is the current API)
pd.read_csv("../FinalResults/rankListImages_expt2.csv")
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)
    data = [row for row in reader]
In [3]:
propCount = [float(row[4]) for row in data]      # share proportion (column 5 of the rank list) for each image
distribFnc = dict(Counter(propCount))            # frequency of each distinct share proportion
# df = pd.DataFrame(cdf,index=['Counter']).transpose()
In [4]:
# plotting histograms - RELEVANT, do not delete, but incorrect; see the bucketed version in the next cell
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
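In [ ]:
# Editorial sketch (not part of the original notebook): the comment in the previous cell flags
# the histogram as incorrect, so this cross-checks the per-bin counts with np.histogram on the
# same bin edges; the next cell rebuilds the distribution by explicit bucketing.
chk_counts, chk_edges = np.histogram(propCount, bins=bins)
pd.Series(chk_counts, index=["%d-%d" % (lo, hi) for lo, hi in zip(bins[:-1], bins[1:])])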
In [47]:
bucket = {i: [] for i in range(0, 110, 10)}   # avoid dict.fromkeys(..., []), which shares one list across keys
for prop in propCount:
    key = math.ceil(prop / 10) * 10           # assign each proportion to the bucket labelled by its upper edge
    bucket[key] = bucket.get(key, []) + [prop]
bucketCount = {key: len(vals) for key, vals in bucket.items()}
df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
# Note: this overwrites the figure saved by the previous cell.
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
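In [ ]:
# Editorial sketch (not in the original): the explicit bucketing loop above is equivalent to a
# groupby on ceil(prop/10)*10; this one-liner reproduces bucketCount (empty buckets included)
# and is a quick way to verify the counts behind the plot.
ser = pd.Series(propCount)
ser.groupby(np.ceil(ser / 10) * 10).size().reindex(np.arange(0, 110, 10.0), fill_value=0)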
In [ ]:
df = pd.read_csv("../FinalResults/ImgShrRnkListWithTags.csv", index_col=0)   # DataFrame.from_csv is deprecated
cols = list(df.columns)
df.drop('URL', axis=1, inplace=True)
df.drop('Album', axis=1, inplace=True)
df.reset_index(inplace=True)
df = df.iloc[np.random.permutation(len(df))]   # shuffle the rows
df.to_csv("/tmp/test.csv", index=False)
reader = csv.reader(open("/tmp/test.csv", "r"))
head = next(reader)
data = {}
for row in reader:
    temp = {}
    for i in range(1, len(row)):
        temp[head[i]] = row[i]
    data[row[0]] = temp
allAttribs = (CH.genHead(data, 'SPECIES') + CH.genHead(data, 'SEX') + CH.genHead(data, 'AGE') +
              CH.genHead(data, 'QUALITY') + CH.genHead(data, 'VIEW_POINT') +
              CH.genHead(data, 'INDIVIDUAL_NAME') + CH.genHead(data, 'tags'))
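In [ ]:
# Editorial sketch (not in the original): the /tmp/test.csv round-trip above builds a
# {row-id: {column: value}} mapping; the same mapping can be built straight from the shuffled
# DataFrame. Caveat: csv.reader yields every field as a string, while to_dict() keeps the
# original dtypes, so this is only an illustration if the downstream helpers expect strings.
data_alt = df.set_index(df.columns[0]).to_dict(orient='index')
len(data_alt) == len(data)   # sanity check: same number of rows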
In [85]:
methods = ['logistic', 'svm', 'dtree', 'random_forests']
fl = open("/tmp/extrmClfReport.dat", "w")
clfDict = {}
performance_metric_dict = []   # one row of metrics per (classifier, split) run
for method in methods:
    for i in np.arange(0.1, 1, 0.1):   # train-test split ratios
        temp = OrderedDict()
        clf, test_x, test_y, predics, pred_prob, score = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
        accScore = accuracy_score(test_y, predics)
        f1Score = f1_score(test_y, predics)
        precision = precision_score(test_y, predics)
        recall = recall_score(test_y, predics)
        auc = roc_auc_score(test_y, pred_prob)
        abserr = mean_absolute_error(test_y, predics)      # computed but not added to the summary table
        sqerr = mean_squared_error(test_y, predics)
        zerooneloss = zero_one_loss(test_y, predics)       # computed but not added to the summary table
        temp['Classifier'] = method
        temp['Train-Test Split'] = i * 100
        temp['Accuracy'] = accScore
        temp['Precision'] = precision
        temp['Recall'] = recall
        temp['AUC'] = auc
        temp['F1 score'] = f1Score
        temp['Squared Error'] = sqerr
        performance_metric_dict.append(temp)
        clfDict[(method, i)] = clf
        fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
        fl.write(classification_report(test_y, predics))
fl.close()
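In [ ]:
# Editorial sketch (assumptions: clf, test_x, test_y still hold the last model/split from the
# loop above, the model is a scikit-learn estimator exposing predict_proba, and pred_prob is the
# predicted probability of the positive class, which is what roc_auc_score expects).
prob_pos = clf.predict_proba(test_x)[:, 1]
print("AUC recomputed from predict_proba: %.4f" % roc_auc_score(test_y, prob_pos))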
In [89]:
df = pd.DataFrame(performance_metric_dict)
df = df[['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf.csv",index=False)
df
Out[89]:
In [6]:
clf, test_x, test_y, predics, pred_prob, score = CH.buildBinClassifier(data, allAttribs, 0.4, 80, 'logistic', True)
In [88]:
for key in clfDict:
    clf = clfDict[key]
    if key[0] == 'random_forests':
        print(key, clf.estimators_)
        break
print(len(test_x.columns))
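In [ ]:
# Editorial sketch (assumption: the stored 'random_forests' model is a scikit-learn
# RandomForestClassifier fitted on the same attribute set as test_x). Its feature_importances_
# play the same role for the forest as the coefficients inspected for the logistic model below.
rf_key = next(k for k in clfDict if k[0] == 'random_forests')
pd.Series(clfDict[rf_key].feature_importances_, index=test_x.columns).sort_values(ascending=False).head(10)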
In [65]:
# Map each attribute to its learned coefficient (assumes clf is a linear model such as the logistic classifier)
d = OrderedDict.fromkeys(test_x.columns, 0)
for i in range(len(clf.coef_[0])):
    d[test_x.columns[i]] = clf.coef_[0][i]
In [69]:
pd.DataFrame(d, index=['coefficients']).transpose()
Out[69]:
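In [ ]:
# Editorial sketch (not in the original): ranking the same coefficients by absolute value makes
# the attributes that push the share prediction hardest, in either direction, easier to spot.
coef_df = pd.DataFrame(d, index=['coefficients']).transpose()
coef_df.reindex(coef_df['coefficients'].abs().sort_values(ascending=False).index).head(10)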
In [ ]: