Notebook name: ImageShareabilityClassifiers.ipynb

Author: Sreejith Menon (smenon8@uic.edu)

General Description:

Multiple features are extracted per image.
The features are majorly classified as:

  • Bilogical features like age, species, sex
  • Ecological features like yaw, view_point
  • Image EXIF/Quality data: unixtime, latitude, longitude, quality
  • Tags generated by Microsoft Image tagging API
  • Image Contributor - Sparse attribute
  • Individual animals (NID)

Based on these features mutliple classification algorithms are implemented and the metrics are evaluated. The aim of the classification algorithms is to predict given features, will a certain image be shared/not shared on a social media platform.
The ClassifierHelperAPI has off-the-shelf implementations from sk-learn library and uses a Classifier Object to store the metrics of each classifier.
The performance metrics evaluated are:

  • Accuracy - Number of correct predictions in the test data
  • Precision
  • Recall
  • F1 score
  • Absolute Error
  • AUC
  • Squared Error - Not displayed currently
  • Zero One Hinge Loss - Not displayed currently

In [85]:
import ClassiferHelperAPI as CH
import importlib
import numpy as np
import pandas as pd
importlib.reload(CH)
from ast import literal_eval
import plotly.plotly as py
import htmltag as HT
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from collections import Counter
import csv
import plotly.graph_objs as go


Building data for the Classifier

  • Discretizing non-binary data using the bag-of-words model
  • Building and running the classifer for all train-test splits starting from 10% upto 90%
  • Computing the performance metrics for each of the classifier.

In [ ]:
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv","sparse","../data/infoGains.csv")
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

In [ ]:
# Block of code for building and running the classifier
# Will generate custom warnings, setting scores to 0, if there are no valid predictions
methods = ['logistic','svm','dtree','random_forests','ada_boost']
methods = ['ada_boost']

classifiers = []
for method in methods:
    for i in np.arange(0.4,0.5,0.1):
        clfObj = CH.buildBinClassifier(data,allAttribs,1-i,80,method)
        clfObj.runClf()
        classifiers.append(clfObj)

In [ ]:
# Writing all the scores into a pandas data-frame and then into a CSV file
printableClfs = []

for clf in classifiers:
    printableClfs.append(dict(literal_eval(clf.__str__())))
    
df = pd.DataFrame(printableClfs)
df = df[['methodName','splitPercent','accScore','precision','recall','f1Score','auc','sqerr']]
df.columns = ['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']
# df.to_csv("../ClassifierResults/extrmClfMetrics_abv_mean.csv",index=False)

In [ ]:
# Will take up valuable Plot.ly plots per day. Limited to 50 plots per day.
# changes to file name important
iFrameBlock = []
for i in np.arange(0.4,0.5,0.1):
    df1 = df[(df['Train-Test Split']==1-i)]
    df1.index = df1['Classifier']
    df1 = df1[['Accuracy','Precision','Recall','F1 score','AUC','Squared Error']].transpose()
    df1.iplot(kind='bar',filename=str('Train-Test_Split_Ratio_abv_mean %f' %i),title=str('Train-Test Split Ratio: %f' %i))
    # iFrameBlock.append(fig.embed_code)

# with open("../ClassifierResults/performanceComparisonsparse.html","w") as perf:
#     perf.write(HT.h1("Performance Comparisons of Classifiers with non_sparse Attributes."))
#     for row in iFrameBlock:
#         perf.write(HT.HTML(row))

Calculating weights of features in the classifiers


In [ ]:
clfWeights = []
for clf in classifiers:
    clfAttribs = list(clf.test_x.columns)
    if clf.methodName == 'logistic':
        clfAttribWgts = list(clf.clfObj.coef_[0])
    elif clf.methodName == 'dtree' or clf.methodName == 'random_forests':
        clfAttribWgts = list(clf.clfObj.feature_importances_)
    else:
        continue
        
        
    attribWgt = {clfAttribs[i] : clfAttribWgts[i] for i in range(len(clfAttribs))}
    attribWgt['Method'] = clf.methodName
    attribWgt['Split_Percent'] = clf.splitPercent
        
    clfWeights.append(attribWgt)

In [ ]:
clfDf = pd.DataFrame(clfWeights)

In [ ]:
indDF = clfDf[(clfDf['Method']=='logistic')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/LogisiticWeights.csv")

indDF = clfDf[(clfDf['Method']=='dtree')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/DecisionTreeWeights.csv")

indDF = clfDf[(clfDf['Method']=='random_forests')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/RandomForestsWeights.csv")

In [ ]:
logisticDf = clfDf[(clfDf['Method']=='logistic')]
del logisticDf['Method']
del logisticDf['Split_Percent']
dtreeDf = clfDf[(clfDf['Method']=='dtree')]
del dtreeDf['Method']
del dtreeDf['Split_Percent']
randomForestDf = clfDf[(clfDf['Method']=='random_forests')]
del randomForestDf['Method']
del randomForestDf['Split_Percent']

In [ ]:
logisticDf = logisticDf.transpose()
logisticDf.reset_index(inplace=True)
logisticDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_logistic = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    logisticDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = logisticDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_logistic.append(df)
    
concatdf_logisitc = pd.concat([dfs_logistic[0],dfs_logistic[1],dfs_logistic[2],dfs_logistic[3],dfs_logistic[4],dfs_logistic[5],dfs_logistic[6],dfs_logistic[7],dfs_logistic[8]],axis=1)
concatdf_logisitc.to_csv("../ClassifierResults/Top15_Weights_Logisitic.csv")

In [ ]:
dtreeDf = dtreeDf.transpose()
dtreeDf.reset_index(inplace=True)
dtreeDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_tree = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    dtreeDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = dtreeDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_tree.append(df)
    
concatdf_dtree = pd.concat([dfs_tree[0],dfs_tree[1],dfs_tree[2],dfs_tree[3],dfs_tree[4],dfs_tree[5],dfs_tree[6],dfs_tree[7],dfs_tree[8]],axis=1)
concatdf_dtree.to_csv("../ClassifierResults/Top15_Weights_Dtree.csv")

In [ ]:
randomForestDf = randomForestDf.transpose()
randomForestDf.reset_index(inplace=True)
randomForestDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_rndf = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    randomForestDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = randomForestDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_rndf.append(df)
    
concatdf_rndf = pd.concat([dfs_rndf[0],dfs_rndf[1],dfs_rndf[2],dfs_rndf[3],dfs_rndf[4],dfs_rndf[5],dfs_rndf[6],dfs_rndf[7],dfs_rndf[8]],axis=1)
concatdf_rndf.to_csv("../ClassifierResults/Top15_Weights_Rndf.csv")

In [ ]:
attribs = [list(dfs_logistic[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)

In [ ]:
attribs = [list(dfs_tree[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)

In [ ]:
attribs = [list(dfs_rndf[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)

In [ ]:
attribs = [list(dfs_logistic[i]['Feature']) for i in range(0,9)]
attribs += [list(dfs_tree[i]['Feature']) for i in range(0,9)]
attribs += [list(dfs_rndf[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)

In [ ]:
logisticDf.sort_values(by='10%',inplace=True,ascending=False)
fig = {
    'data' : [
        {'x' : logisticDf.Feature.head(15),'y' : logisticDf['10%'].head(15), 'mode' : 'markers', 'name' : '10%'}
    ]
}
iplot(fig)

In [ ]:
obj1.precision

In [ ]:
classifiers[0].preds

In [ ]:
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")  
methods = ['dummy','bayesian','logistic','svm','dtree','random_forests','ada_boost']
kwargsDict = {'dummy' : {'strategy' : 'most_frequent'},
            'bayesian' : {'fit_prior' : True},
            'logistic' : {'penalty' : 'l2'},
            'svm' : {'kernel' : 'rbf','probability' : True},
            'dtree' : {'criterion' : 'entropy'},
            'random_forests' : {'n_estimators' : 10 },
            'ada_boost' : {'n_estimators' : 50 }}

In [ ]:
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv",'non_sparse',"../data/infoGainsExpt2.csv")
clfObj = CH.buildBinClassifier(data,allAttribs,1-0.5,80,'dtree',kwargsDict['dtree'])
clfObj.runClf()

In [ ]:
clfObj.precision,clfObj.recall,clfObj.methodName

In [ ]:
fpr,tpr,_ = clfObj.roccurve
rocCurve = {}
for i in range(len(fpr)):
    rocCurve[fpr[i]] = tpr[i]
    
pd.DataFrame(rocCurve,index=['tpr']).transpose().iplot()

In [ ]: