In [1]:
import sys
sys.path.append("../script/")
import csv
import math
from collections import Counter, OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import ClassiferHelperAPI as CH
import importlib
importlib.reload(CH)

from sklearn import metrics
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             mean_absolute_error, mean_squared_error, zero_one_loss)

In [2]:
# pd.DataFrame.from_csv is deprecated (pd.read_csv is the supported reader);
# the csv module is used here so the rows stay plain lists.
with open("../FinalResults/rankListImages_expt2.csv", "r") as inpFl:
    reader = csv.reader(inpFl)
    head = next(reader)              # header row
    data = [row for row in reader]

In [3]:
# column 4 holds the share proportion for each image
propCount = [float(row[4]) for row in data]
distribFnc = dict(Counter(propCount))

In [4]:
# Histogram of share proportions - kept for reference, but the binning is
# incorrect: plt.hist uses half-open bins, which do not match the
# ceil-style buckets computed in the next cell.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

counts, bin_edges, patches = plt.hist(propCount, bins, histtype='bar')
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')

In [47]:
# dict.fromkeys with a mutable default would share a single list across all
# keys; a comprehension gives each bucket its own list.
bucket = {i: [] for i in range(0, 110, 10)}

for prop in propCount:
    key = math.ceil(prop / 10) * 10   # e.g. 73.5 -> bucket 80
    bucket[key].append(prop)

bucketCount = {key: len(vals) for key, vals in bucket.items()}

df = pd.DataFrame(bucketCount, index=['y']).transpose()
df.plot()
plt.ylabel("Number of images")
plt.xlabel("Share proportion")
plt.title("Share proportion (X-axis) versus number of images shared x% of times (Y-axis)")
plt.xticks(np.arange(0, 100 + 1, 10))
plt.yticks(np.arange(0, 200 + 1, 10.0))
plt.grid(b=True, which='major', color='k', linestyle=':')
plt.savefig("../ClassifierResults/shareProportionVsNumImgsMod.png", bbox_inches='tight')
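
The same bucket counts can be computed directly with pd.cut; a minimal sketch (note that include_lowest=True folds exact-zero proportions into the first bin rather than giving them their own 0 bucket as math.ceil does):

# (0,10], (10,20], ..., (90,100] intervals mirror the ceil-style buckets above
binned = pd.cut(pd.Series(propCount), bins=list(range(0, 110, 10)), include_lowest=True)
print(binned.value_counts().sort_index())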

In [37]:
methods = ['logistic', 'svm', 'dtree', 'random_forests']
clfDict = {}                      # fitted classifier for each (method, split) pair
performance_metric_dict = []
# allAttribs is assumed to have been built earlier in the session.
with open("/tmp/extrmClfReport.dat", "w") as fl:
    for method in methods:
        for i in np.arange(0.1, 1, 0.1):
            clf, test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
            temp = OrderedDict()
            temp['Classifier'] = method
            temp['Train-Test Split'] = i * 100
            temp['Accuracy'] = accuracy_score(test_y, predics)
            temp['Precision'] = precision_score(test_y, predics)
            temp['Recall'] = recall_score(test_y, predics)
            temp['AUC'] = roc_auc_score(test_y, pred_prob)
            temp['F1 score'] = f1_score(test_y, predics)
            temp['Squared Error'] = mean_squared_error(test_y, predics)
            performance_metric_dict.append(temp)
            clfDict[(method, i)] = clf
            fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
            fl.write(classification_report(test_y, predics))


/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
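
These warnings mean a classifier assigned no test image to one of the classes, so precision and F1 for that class default to 0.0 (the svm row at the 90% split below shows exactly this). A minimal way to spot the degenerate case, assuming predics is a 0/1 array or list:

# counts of predicted 0s and 1s; a zero entry means one class was never predicted
print(np.bincount(np.asarray(predics, dtype=int), minlength=2))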

In [38]:
df = pd.DataFrame(performance_metric_dict)

df = df[['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']]
df.to_csv("../ClassifierResults/perform_metrics_extrmClf1.csv",index=False)
df


Out[38]:
Classifier Train-Test Split Accuracy Precision Recall F1 score AUC Squared Error
0 logistic 10 0.767442 0.800000 0.800000 0.800000 0.788889 0.232558
1 logistic 20 0.705882 0.734694 0.750000 0.742268 0.754505 0.294118
2 logistic 30 0.740157 0.740260 0.814286 0.775510 0.779449 0.259843
3 logistic 40 0.715976 0.711538 0.804348 0.755102 0.774139 0.284024
4 logistic 50 0.710900 0.697842 0.836207 0.760784 0.754991 0.289100
5 logistic 60 0.712598 0.704819 0.829787 0.762215 0.760215 0.287402
6 logistic 70 0.702703 0.740506 0.713415 0.726708 0.744041 0.297297
7 logistic 80 0.656805 0.673367 0.724324 0.697917 0.707613 0.343195
8 logistic 90 0.631579 0.719512 0.556604 0.627660 0.704220 0.368421
9 svm 10 0.581395 0.581395 1.000000 0.735294 0.797778 0.418605
10 svm 20 0.564706 0.564706 1.000000 0.721805 0.757320 0.435294
11 svm 30 0.551181 0.551181 1.000000 0.710660 0.725815 0.448819
12 svm 40 0.544379 0.544379 1.000000 0.704981 0.732072 0.455621
13 svm 50 0.549763 0.549763 1.000000 0.709480 0.720690 0.450237
14 svm 60 0.555118 0.555118 1.000000 0.713924 0.719921 0.444882
15 svm 70 0.554054 0.554054 1.000000 0.713043 0.363082 0.445946
16 svm 80 0.547337 0.547337 1.000000 0.707457 0.376647 0.452663
17 svm 90 0.442105 0.000000 0.000000 0.000000 0.646058 0.557895
18 dtree 10 0.697674 0.714286 0.800000 0.754717 0.672222 0.302326
19 dtree 20 0.576471 0.642857 0.562500 0.600000 0.578547 0.423529
20 dtree 30 0.700787 0.710526 0.771429 0.739726 0.690727 0.299213
21 dtree 40 0.686391 0.709677 0.717391 0.713514 0.685065 0.313609
22 dtree 50 0.625592 0.635036 0.750000 0.687747 0.613884 0.374408
23 dtree 60 0.657480 0.670886 0.751773 0.709030 0.645798 0.342520
24 dtree 70 0.628378 0.662651 0.670732 0.666667 0.623245 0.371622
25 dtree 80 0.597633 0.635359 0.621622 0.628415 0.595125 0.402367
26 dtree 90 0.621053 0.650442 0.693396 0.671233 0.611579 0.378947
27 random_forests 10 0.697674 0.800000 0.640000 0.711111 0.790000 0.302326
28 random_forests 20 0.694118 0.739130 0.708333 0.723404 0.678491 0.305882
29 random_forests 30 0.685039 0.750000 0.642857 0.692308 0.760777 0.314961
30 random_forests 40 0.662722 0.688172 0.695652 0.691892 0.706875 0.337278
31 random_forests 50 0.672986 0.685039 0.750000 0.716049 0.718013 0.327014
32 random_forests 60 0.633858 0.650000 0.737589 0.691030 0.688445 0.366142
33 random_forests 70 0.695946 0.725610 0.725610 0.725610 0.728127 0.304054
34 random_forests 80 0.633136 0.691824 0.594595 0.639535 0.697421 0.366864
35 random_forests 90 0.557895 0.683333 0.386792 0.493976 0.644612 0.442105
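
A recall of exactly 1.0 with precision equal to accuracy (the svm rows up to the 80% split) is the signature of a model that labels every test image as shared: no positives are missed, and precision collapses to the positive base rate. A quick check over the table, a sketch assuming df is still in scope (exact float equality works here because both numbers come from the same TP/N computation; np.isclose is the safer general tool):

# flag rows where the positive class was predicted for every test image
degenerate = df[(df['Recall'] == 1.0) & (df['Precision'] == df['Accuracy'])]
print(degenerate[['Classifier', 'Train-Test Split']])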

In [6]:
# buildBinClassifier returns five values (see the loop above); the original
# six-way unpack with a trailing `score` would raise a ValueError.
clf, test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data, allAttribs, 0.4, 80, 'logistic', True)

In [88]:
for key in clfDict:
    clf = clfDict[key]
    
    if key[0] == 'random_forests':
        print(key, clf.estimators_)
        break
        
print(len(test_x.columns))


('random_forests', 0.20000000000000001) [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1865640145, splitter='best'),
 ... nine further DecisionTreeClassifier estimators, identical except for random_state ...]
835
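
For the random forest, the aggregate feature ranking is usually more informative than the raw estimator list; a sketch, assuming the forest in clf was trained on the same attribute columns that test_x carries:

# mean decrease in impurity, averaged over the ten trees
importances = pd.Series(clf.feature_importances_, index=test_x.columns)
print(importances.sort_values(ascending=False).head(10))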

In [65]:
# clf is assumed to be the fitted logistic-regression model here (coef_ is
# only defined for linear classifiers). Keying by column name collapses
# duplicate columns, which is why Out[69] reports 832 rows against the 835
# columns printed above.
d = OrderedDict(zip(test_x.columns, clf.coef_[0]))

In [69]:
pd.DataFrame(d, index=['coefficients']).transpose()


Out[69]:
coefficients
zebra_plains -0.468173
giraffe_masai 0.516168
UNKNOWN NAME -0.043396
Female 0.498140
UNKNOWN SEX -0.409351
Male -0.210480
unknown -0.406869
juveniles - one year old -0.351691
juveniles- two year old -0.268446
adult 0.034764
infant 0.125538
excellent 0.818747
poor 0.228112
good 0.917564
ok 0.414937
junk -0.162535
backright 0.632304
front 0.112286
frontleft 0.594611
right 0.400426
left -0.441930
back -0.418995
frontright 0.184198
backleft -0.916222
IBEIS_PZ_1221 0.000000
IBEIS_PZ_0886 0.391212
IBEIS_PZ_1547 0.000000
IBEIS_PZ_0956 0.000000
IBEIS_PZ_1845 0.000000
NNP_GIRM_0116 -0.196775
... ...
sheep -0.293586
arthropod 0.000000
elephant -0.147098
lone -0.099475
group -1.042633
mountain 0.001608
laying 0.000000
animal 0.345434
dry -0.333251
grazing -0.246223
cactus -0.093977
hill 0.237482
brown 0.159295
sky -0.304497
mammal 0.324114
road 0.920785
wild -0.579443
tall -0.742962
giraffe 0.224554
brush 0.000000
eating -0.366870
field 0.081002
dirt 0.170823
antelope -0.056490
running 0.000000
bushes -0.259766
mother 0.000000
path -0.020852
standing 0.638315
outdoor -0.762959

832 rows × 1 columns
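
Sorting the coefficient series makes the strongest predictors easier to read off; a short sketch over the d built above:

coefs = pd.Series(d).sort_values()
print(coefs.tail(5))   # strongest positive predictors of sharing
print(coefs.head(5))   # strongest negative predictors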


In [34]:
d = CH.createDataFlDict(data,allAttribs,True,80,True)

In [35]:
# number of positive/negative examples; these should match the >= 80% / <= 20%
# proportion counts computed below
posEgs = sum(1 for key in d if d[key]['TARGET'] == 1)
negEgs = len(d) - posEgs

In [36]:
posEgs,negEgs


Out[36]:
(230, 192)

In [19]:
# Note: `data` in these cells is the gid-keyed dictionary used by the
# classifier helpers, not the raw csv row list from In [2].
print(len(list(filter(lambda x: float(data[x]['Proportion']) >= 80.0, data.keys()))))
print(len(list(filter(lambda x: float(data[x]['Proportion']) <= 20.0, data.keys()))))


230
192

In [20]:
230 + 192


Out[20]:
422

In [21]:
fDataKeys = list(filter(lambda x: float(data[x]['Proportion']) >= 80.0 or float(data[x]['Proportion']) <= 20.0, data.keys()))
fData = {gid: data[gid] for gid in fDataKeys}

In [30]:
pd.DataFrame(fData).transpose()['Proportion'].to_csv("/tmp/props.csv")

In [32]:
pd.DataFrame(d).transpose()['TARGET'].to_csv("/tmp/targs.csv")

In [50]:
methods = ['logistic', 'svm', 'dtree', 'random_forests']
performance_metric_dict = []
with open("/tmp/extrmClfReport.dat", "w") as fl:
    for method in methods:
        for i in np.arange(0.1, 1, 0.1):
            clf, test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data, allAttribs, i, 80, method, True)
            temp = CH.evalClassifierPerf(clf, test_x, test_y, predics, pred_prob)
            temp['Method'] = method
            temp['Train-Test Split'] = i * 100
            performance_metric_dict.append(temp)
            clfDict[(method, i)] = clf
            fl.write("Method: " + method + "\nSplit Ratio: " + str(i) + "\n\n")
            fl.write(classification_report(test_y, predics))


/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/sreejithmenon/anaconda/lib/python3.5/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-50-aacefb79a1c2> in <module>()
      6     for i in np.arange(0.1,1,0.1):
      7         temp = OrderedDict()
----> 8         clf,test_x, test_y, predics, pred_prob = CH.buildBinClassifier(data,allAttribs,i,80,method,True)
      9         temp = CH.evalClassifierPerf(clf,test_x,test_y,predics,pred_prob)
     10         performance_metric_dict.append(temp)

/Users/sreejithmenon/Google Drive/Project/AnimalPhotoBias/script/ClassiferHelperAPI.py in buildBinClassifier(data, allAttribs, trainTestSplit, threshold, methodName, extremeClf)
     98 # returns test attributes, actual test TARGET, predicted values and predicition probabilities
     99 def buildBinClassifier(data,allAttribs,trainTestSplit,threshold,methodName,extremeClf=True):
--> 100     gidAttribDict = createDataFlDict(data,allAttribs,True,threshold,extremeClf) # binaryClf attribute in createDataFlDict will be True here
    101 
    102     train_x,test_x,train_y,test_y = trainTestSplitter(gidAttribDict,allAttribs,trainTestSplit) # new statement

/Users/sreejithmenon/Google Drive/Project/AnimalPhotoBias/script/ClassiferHelperAPI.py in createDataFlDict(data, allAttribs, binaryClf, threshold, extremeClf)
     61         gidAttribDict[gid] = attribDict
     62 
---> 63     json.dump(gidAttribDict,open("/tmp/gidAttribDict.json","w"),indent=4)
     64 
     65     pd.DataFrame(gidAttribDict).transpose().to_csv("/tmp/gidAttribDict.csv")

/Users/sreejithmenon/anaconda/lib/python3.5/json/__init__.py in dump(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    176     # could accelerate with writelines in some versions of Python, at
    177     # a debuggability cost
--> 178     for chunk in iterable:
    179         fp.write(chunk)
    180 

/Users/sreejithmenon/anaconda/lib/python3.5/json/encoder.py in _iterencode(o, _current_indent_level)
    427             yield from _iterencode_list(o, _current_indent_level)
    428         elif isinstance(o, dict):
--> 429             yield from _iterencode_dict(o, _current_indent_level)
    430         else:
    431             if markers is not None:

/Users/sreejithmenon/anaconda/lib/python3.5/json/encoder.py in _iterencode_dict(dct, _current_indent_level)
    401                 else:
    402                     chunks = _iterencode(value, _current_indent_level)
--> 403                 yield from chunks
    404         if newline_indent is not None:
    405             _current_indent_level -= 1

KeyboardInterrupt: 
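
The interrupt landed inside the json.dump call in createDataFlDict, which serializes the full attribute dictionary to /tmp on every buildBinClassifier invocation; caching or skipping that dump would make the 36-combination sweep considerably faster.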

In [2]:
import mpl_toolkits as mt

In [5]:
from mpl_toolkits.basemap import Basemap


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-5-5e6824321d57> in <module>()
----> 1 from mpl_toolkits.basemap import Basemap

ImportError: No module named 'mpl_toolkits.basemap'
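
basemap is distributed separately from matplotlib, so the import fails until the package itself is installed (on an Anaconda setup, `conda install basemap` is the usual route).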

In [ ]: