Notebook name: ImageShareabilityClassifiers.ipynb

Author: Sreejith Menon (smenon8@uic.edu)

General Description:

Multiple features are extracted per image.
The features are majorly classified as:

  • Biological features like age, species, sex
  • Ecological features like yaw, view_point
  • Image EXIF/Quality data: unixtime, latitude, longitude, quality
  • Tags generated by Microsoft Image tagging API
  • Image Contributor - Sparse attribute
  • Individual animals (NID)

Based on these features multiple classification algorithms are implemented and the metrics are evaluated. The aim of the classification algorithms is to predict, given the features, whether a certain image will be shared/not shared on a social media platform.
The ClassifierHelperAPI has off-the-shelf implementations from sk-learn library and uses a Classifier Object to store the metrics of each classifier.
The performance metrics evaluated are:

  • Accuracy - Number of correct predictions in the test data
  • Precision
  • Recall
  • F1 score
  • Absolute Error
  • AUC
  • Squared Error - Not displayed currently
  • Zero One Hinge Loss - Not displayed currently

In [8]:
import ClassiferHelperAPI as CH
import importlib
import numpy as np
import pandas as pd
importlib.reload(CH)
from ast import literal_eval
import plotly.plotly as py
import htmltag as HT
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from collections import Counter
import csv
import plotly.graph_objs as go


Building data for the Classifier

  • Discretizing non-binary data using the bag-of-words model
  • Building and running the classifier for all train-test splits starting from 10% up to 90%
  • Computing the performance metrics for each of the classifier.

In [2]:
# Build the attribute list ("sparse" mode, with the info-gain file — presumably
# for feature selection; confirm in ClassiferHelperAPI) and load the master
# per-image feature/share data from the tagged results CSV.
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv","sparse","../data/infoGains.csv")
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

In [13]:
# Build and run one binary classifier per (method, train-test split).
# CH.buildBinClassifier may emit custom warnings (scores set to 0) when a
# fold produces no valid predictions.
methods = ['dummy', 'bayesian', 'logistic', 'svm', 'dtree', 'random_forests', 'ada_boost']
# Per-method keyword arguments forwarded to the underlying sk-learn estimators.
clfArgs = {'dummy' : {'strategy' : 'most_frequent'},
            'bayesian' : {'fit_prior' : True},
            'logistic' : {'penalty' : 'l2'},
            'svm' : {'kernel' : 'rbf','probability' : True},
            'dtree' : {'criterion' : 'entropy'},
            'random_forests' : {'n_estimators' : 10 },
            'ada_boost' : {'n_estimators' : 50 }}
classifiers = []
for method in methods:
    # Splits from 10% up to 90%, matching the notebook description and the
    # nine split columns assumed by the weight-export cells below.
    # (Was np.arange(0.4, 0.5, 0.1), which produced only a single split.)
    for testFraction in np.arange(0.1, 1.0, 0.1):
        clfObj = CH.buildBinClassifier(data, allAttribs, 1 - testFraction, 80, method, clfArgs)
        clfObj.runClf()
        classifiers.append(clfObj)

In [25]:
# Collect every classifier's metrics into a DataFrame (and optionally a CSV).
# Each Classifier object's repr is a dict-literal string, so str() + literal_eval
# recovers the metric dict; str(clf) is the idiomatic form of clf.__str__().
printableClfs = [dict(literal_eval(str(clf))) for clf in classifiers]

df = pd.DataFrame(printableClfs)
df = df[['methodName','splitPercent','accScore','precision','recall','f1Score','auc','sqerr']]
df.columns = ['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']
# df.to_csv("../ClassifierResults/extrmClfMetrics_abv_mean.csv",index=False)

In [26]:
# Caution: Plot.ly accounts are capped at 50 plots per day -- keep the
# plot filenames distinct and meaningful.
iFrameBlock = []
for split in np.arange(0.4, 0.5, 0.1):
    # Slice the metrics for this split, then pivot so metrics become rows
    # and classifiers become columns.  df1 is intentionally left at module
    # level: later cells read it.
    df1 = df[df['Train-Test Split'] == 1 - split]
    df1.index = df1['Classifier']
    df1 = df1[['Accuracy', 'Precision', 'Recall', 'F1 score', 'AUC', 'Squared Error']].transpose()
    df1.iplot(kind='bar',
              filename='Train-Test_Split_Ratio_abv_mean %f' % split,
              title='Train-Test Split Ratio: %f' % split)
    # iFrameBlock.append(fig.embed_code)

# with open("../ClassifierResults/performanceComparisonsparse.html","w") as perf:
#     perf.write(HT.h1("Performance Comparisons of Classifiers with non_sparse Attributes."))
#     for row in iFrameBlock:
#         perf.write(HT.HTML(row))


Calculating weights of features in the classifiers


In [ ]:
# Pull per-feature weights out of the classifiers that expose them:
# logistic regression (coef_) and the tree-based models (feature_importances_).
# Other methods carry no comparable weight vector and are skipped.
clfWeights = []
for clf in classifiers:
    if clf.methodName == 'logistic':
        weights = list(clf.clfObj.coef_[0])
    elif clf.methodName in ('dtree', 'random_forests'):
        weights = list(clf.clfObj.feature_importances_)
    else:
        continue

    # One record per classifier: feature -> weight, tagged with method/split.
    attribWgt = {attrib: weights[idx]
                 for idx, attrib in enumerate(clf.test_x.columns)}
    attribWgt['Method'] = clf.methodName
    attribWgt['Split_Percent'] = clf.splitPercent
    clfWeights.append(attribWgt)

In [ ]:
clfDf = pd.DataFrame(clfWeights)

In [ ]:
def exportMethodWeights(weightsDf, method, outPath):
    """Write one method's feature weights to CSV, one column per train-test split.

    weightsDf : frame with 'Method' and 'Split_Percent' columns plus one
                column per feature (i.e. `clfDf` above).
    method    : value of the 'Method' column to select rows by.
    outPath   : destination CSV path.
    """
    # .copy() so we never mutate a view of clfDf (SettingWithCopyWarning).
    methodDf = weightsDf[weightsDf['Method'] == method].copy()
    methodDf.index = methodDf['Split_Percent']
    # Keyword form; positional axis (drop('Method', 1)) is deprecated in pandas.
    methodDf = methodDf.drop(columns='Method')
    methodDf.transpose().to_csv(outPath)

# One export per weight-bearing method (was three copy-pasted blocks).
# The "Logisitic" typo in the path is kept so existing CSV consumers still work.
for method, outFile in [('logistic', "../ClassifierResults/LogisiticWeights.csv"),
                        ('dtree', "../ClassifierResults/DecisionTreeWeights.csv"),
                        ('random_forests', "../ClassifierResults/RandomForestsWeights.csv")]:
    exportMethodWeights(clfDf, method, outFile)

In [ ]:
def _methodWeights(method):
    """Rows of clfDf for `method`, without the 'Method'/'Split_Percent' bookkeeping columns."""
    # drop(columns=...) returns a fresh frame, so clfDf itself is never touched
    # (the original `del` on a slice risked SettingWithCopyWarning).
    subset = clfDf[clfDf['Method'] == method]
    return subset.drop(columns=['Method', 'Split_Percent'])

# Per-method weight frames (was three copy-pasted filter/del blocks).
logisticDf = _methodWeights('logistic')
dtreeDf = _methodWeights('dtree')
randomForestDf = _methodWeights('random_forests')

In [ ]:
# Top-15 features by logistic-regression weight, one ranked table per split.
logisticDf = logisticDf.transpose()
logisticDf.reset_index(inplace=True)
# NOTE(review): assumes exactly nine splits (10%..90%) were run above.
logisticDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_logistic = []
for i in range(10, 100, 10):
    prcnt = '%d%%' % i
    logisticDf.sort_values(by=prcnt, inplace=True, ascending=False)
    # Named top15 (not `df`) so the module-level metrics frame isn't shadowed.
    top15 = logisticDf[['Feature', prcnt]].head(15)
    # Rank index 1..n; len() guards against fewer than 15 features.
    top15.index = np.arange(1, len(top15) + 1)
    dfs_logistic.append(top15)

# Concatenate the whole list instead of enumerating all nine frames by hand.
concatdf_logisitc = pd.concat(dfs_logistic, axis=1)
concatdf_logisitc.to_csv("../ClassifierResults/Top15_Weights_Logisitic.csv")

In [ ]:
# Top-15 features by decision-tree importance, one ranked table per split.
dtreeDf = dtreeDf.transpose()
dtreeDf.reset_index(inplace=True)
# NOTE(review): assumes exactly nine splits (10%..90%) were run above.
dtreeDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_tree = []
for i in range(10, 100, 10):
    prcnt = '%d%%' % i
    dtreeDf.sort_values(by=prcnt, inplace=True, ascending=False)
    # Named top15 (not `df`) so the module-level metrics frame isn't shadowed.
    top15 = dtreeDf[['Feature', prcnt]].head(15)
    # Rank index 1..n; len() guards against fewer than 15 features.
    top15.index = np.arange(1, len(top15) + 1)
    dfs_tree.append(top15)

# Concatenate the whole list instead of enumerating all nine frames by hand.
concatdf_dtree = pd.concat(dfs_tree, axis=1)
concatdf_dtree.to_csv("../ClassifierResults/Top15_Weights_Dtree.csv")

In [ ]:
# Top-15 features by random-forest importance, one ranked table per split.
randomForestDf = randomForestDf.transpose()
randomForestDf.reset_index(inplace=True)
# NOTE(review): assumes exactly nine splits (10%..90%) were run above.
randomForestDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_rndf = []
for i in range(10, 100, 10):
    prcnt = '%d%%' % i
    randomForestDf.sort_values(by=prcnt, inplace=True, ascending=False)
    # Named top15 (not `df`) so the module-level metrics frame isn't shadowed.
    top15 = randomForestDf[['Feature', prcnt]].head(15)
    # Rank index 1..n; len() guards against fewer than 15 features.
    top15.index = np.arange(1, len(top15) + 1)
    dfs_rndf.append(top15)

# Concatenate the whole list instead of enumerating all nine frames by hand.
concatdf_rndf = pd.concat(dfs_rndf, axis=1)
concatdf_rndf.to_csv("../ClassifierResults/Top15_Weights_Rndf.csv")

In [ ]:
# How often each feature appears across the nine per-split logistic top-15 tables.
attribs = [attrib for i in range(9) for attrib in dfs_logistic[i]['Feature']]
pd.DataFrame(Counter(attribs), index=['Frequency']).transpose().sort_values(by=['Frequency'], ascending=False)

In [ ]:
# How often each feature appears across the nine per-split decision-tree top-15 tables.
attribs = [attrib for i in range(9) for attrib in dfs_tree[i]['Feature']]
pd.DataFrame(Counter(attribs), index=['Frequency']).transpose().sort_values(by=['Frequency'], ascending=False)

In [ ]:
# How often each feature appears across the nine per-split random-forest top-15 tables.
attribs = [attrib for i in range(9) for attrib in dfs_rndf[i]['Feature']]
pd.DataFrame(Counter(attribs), index=['Frequency']).transpose().sort_values(by=['Frequency'], ascending=False)

In [ ]:
# Feature frequencies pooled across all three models' top-15 tables
# (logistic first, then decision tree, then random forest — same order as before).
attribs = [attrib
           for dfs in (dfs_logistic, dfs_tree, dfs_rndf)
           for i in range(9)
           for attrib in dfs[i]['Feature']]
pd.DataFrame(Counter(attribs), index=['Frequency']).transpose().sort_values(by=['Frequency'], ascending=False)

In [ ]:
# Scatter of the 15 largest logistic weights for the 10% split.
logisticDf.sort_values(by='10%', inplace=True, ascending=False)
top = logisticDf.head(15)
fig = {
    'data': [
        {'x': top['Feature'], 'y': top['10%'], 'mode': 'markers', 'name': '10%'}
    ]
}
iplot(fig)

In [ ]:
obj1.precision

In [ ]:
classifiers[0].preds

In [ ]:
# Reload the master data and re-declare the per-method estimator arguments
# (duplicates the setup cells above so this section can be run independently).
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")
methods = ['dummy','bayesian','logistic','svm','dtree','random_forests','ada_boost']
kwargsDict = {'dummy' : {'strategy' : 'most_frequent'},
            'bayesian' : {'fit_prior' : True},
            'logistic' : {'penalty' : 'l2'},
            'svm' : {'kernel' : 'rbf','probability' : True},
            'dtree' : {'criterion' : 'entropy'},
            'random_forests' : {'n_estimators' : 10 },
            'ada_boost' : {'n_estimators' : 50 }}

In [ ]:
# Build and run a single decision-tree classifier on the non_sparse
# attribute set with a 50-50 train-test split.
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv", 'non_sparse', "../data/infoGainsExpt2.csv")
clfObj = CH.buildBinClassifier(data, allAttribs, 0.5, 80, 'dtree', kwargsDict['dtree'])
clfObj.runClf()

In [ ]:
clfObj.precision,clfObj.recall,clfObj.methodName

In [ ]:
# Map FPR -> TPR and plot the ROC curve for the classifier above.
# dict(zip(...)) collapses duplicate FPR values to the last TPR, exactly
# like the original element-wise loop did.
fpr, tpr, _ = clfObj.roccurve
rocCurve = dict(zip(fpr, tpr))

pd.DataFrame(rocCurve, index=['tpr']).transpose().iplot()

In [9]:
CH.getLearningAlgo('random_forests', clfArgs)


Out[9]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [16]:
classifiers[0].clfObj


Out[16]:
DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [28]:
# Pivot df1 (metrics x classifiers) back to classifiers-as-rows for display.
df = df1.transpose().reset_index()
df


Out[28]:
Classifier Accuracy Precision Recall F1 score AUC Squared Error
0 dummy 0.555118 0.555118 1.000000 0.713924 0.500000 0.444882
1 bayesian 0.704724 0.689655 0.851064 0.761905 0.763541 0.295276
2 logistic 0.712598 0.704819 0.829787 0.762215 0.760277 0.287402
3 svm 0.555118 0.555118 1.000000 0.713924 0.722557 0.444882
4 dtree 0.645669 0.664516 0.730496 0.695946 0.635160 0.354331
5 random_forests 0.716535 0.731544 0.773050 0.751724 0.738436 0.283465
6 ada_boost 0.696850 0.690476 0.822695 0.750809 0.727547 0.303150

In [59]:
# Grouped bar chart comparing all classifiers across the quality metrics.
layout = go.Layout(
    showlegend=True,
    legend=dict(x=1, y=1, font=dict(size=20)),
    xaxis=dict(
        title='Classification Quality Metrics',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        # tickangle=45
    ),
    yaxis=dict(
        title="Percentage (%)",
        ticklen=5,
        titlefont=dict(size=20),
        tickfont=dict(size=20),
        # range=range
    ),
    barmode='group'  # was 'grouped' — not a valid plotly barmode value
)

# (column in df1, legend label, bar colour, opacity) — replaces seven
# copy-pasted go.Bar blocks.
barSpecs = [
    ('dummy', 'Dummy', 'red', 0.5),
    ('bayesian', 'Bayesian', 'green', 0.5),
    ('logistic', 'Logistic', 'blue', 0.5),
    ('svm', 'SVM', 'pink', 1),
    ('dtree', 'Decision Tree', 'orange', 1),
    ('random_forests', 'Random Forests', 'brown', 0.5),
    ('ada_boost', 'Ada Boost', 'yellow', 1),
]
# Scores are fractions; scale to percentages for the y-axis.
traces = [go.Bar(x=df1.index,
                 y=df1[col] * 100,
                 name=label,
                 opacity=opacity,
                 marker=dict(color=colour))
          for col, label, colour, opacity in barSpecs]

# Named `traces` (not `data`) so the master feature table loaded earlier
# into `data` is no longer clobbered by this plotting cell.
fig = dict(data=traces, layout=layout)
iplot(fig, filename="Expt2 Training data distributions")



In [36]:
df = df1.reset_index()

In [39]:
df1.index


Out[39]:
Index(['Accuracy', 'Precision', 'Recall', 'F1 score', 'AUC', 'Squared Error'], dtype='object')

In [42]:
df1


Out[42]:
Classifier dummy bayesian logistic svm dtree random_forests ada_boost
Accuracy 0.555118 0.704724 0.712598 0.555118 0.645669 0.716535 0.696850
Precision 0.555118 0.689655 0.704819 0.555118 0.664516 0.731544 0.690476
Recall 1.000000 0.851064 0.829787 1.000000 0.730496 0.773050 0.822695
F1 score 0.713924 0.761905 0.762215 0.713924 0.695946 0.751724 0.750809
AUC 0.500000 0.763541 0.760277 0.722557 0.635160 0.738436 0.727547
Squared Error 0.444882 0.295276 0.287402 0.444882 0.354331 0.283465 0.303150

In [62]:
classifiers[0].roccurve[0]


Out[62]:
array([ 0.,  1.])

In [71]:
# Overlayed ROC curves (FPR vs TPR) for all seven classifiers.
layout = go.Layout(
    showlegend=True,
    legend=dict(x=1, y=1, font=dict(size=15)),
    xaxis=dict(
        title='False Positive Rate (FPR)',
        ticklen=5,
        zeroline=True,
        titlefont=dict(size=15),
        tickfont=dict(size=15),
        # tickangle=45
    ),
    yaxis=dict(
        title="True Positive Rate (TPR)",
        ticklen=5,
        titlefont=dict(size=15),
        tickfont=dict(size=15),
        # range=range
    )
    # barmode dropped: it only applies to bar traces, not Scatter.
)

# (classifier index, legend label, colour, opacity) — replaces seven
# copy-pasted go.Scatter blocks; index order matches the `methods` list
# used when `classifiers` was built.
rocSpecs = [
    (0, 'Dummy', 'red', 0.5),
    (1, 'Bayesian', 'green', 0.5),
    (2, 'Logistic', 'blue', 1),
    (3, 'SVM', 'pink', 0.5),
    (4, 'Decision Tree', 'orange', 1),
    (5, 'Random Forests', 'brown', 1),
    (6, 'Ada Boost', 'yellow', 1),
]
traces = [go.Scatter(x=classifiers[idx].roccurve[0],
                     y=classifiers[idx].roccurve[1],
                     name=label,
                     opacity=opacity,
                     marker=dict(color=colour))
          for idx, label, colour, opacity in rocSpecs]

# Named `traces` (not `data`) so the master feature table is not clobbered.
fig = dict(data=traces, layout=layout)
# NOTE(review): filename says "Training data distributions" but this is a
# ROC-curve figure — confirm before reusing the hosted plot name.
iplot(fig, filename="Expt2 Training data distributions")



In [ ]: