Notebook name: ImageShareabilityClassifiers.ipynb

Author: Sreejith Menon (smenon8@uic.edu)

General Description:

Multiple features are extracted per image.
The features are majorly classified as:

Bilogical features like age, species, sex
Ecological features like yaw, view_point
Image EXIF/Quality data: unixtime, latitude, longitude, quality
Tags generated by Microsoft Image tagging API
Image Contributor - Sparse attribute
Individual animals (NID)

Based on these features mutliple classification algorithms are implemented and the metrics are evaluated. The aim of the classification algorithms is to predict given features, will a certain image be shared/not shared on a social media platform.
The ClassifierHelperAPI has off-the-shelf implementations from sk-learn library and uses a Classifier Object to store the metrics of each classifier.
The performance metrics evaluated are:

Accuracy - Number of correct predictions in the test data
Precision
Recall
F1 score
Absolute Error
AUC
Squared Error - Not displayed currently
Zero One Hinge Loss - Not displayed currently



In [85]:

    
import ClassiferHelperAPI as CH
import importlib
import numpy as np
import pandas as pd
importlib.reload(CH)
from ast import literal_eval
import plotly.plotly as py
import htmltag as HT
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from collections import Counter
import csv
import plotly.graph_objs as go

Building data for the Classifier

Discretizing non-binary data using the bag-of-words model
Building and running the classifer for all train-test splits starting from 10% upto 90%
Computing the performance metrics for each of the classifier.



In [ ]:

    
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv","sparse","../data/infoGains.csv")
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")



In [ ]:

    
# Block of code for building and running the classifier
# Will generate custom warnings, setting scores to 0, if there are no valid predictions
methods = ['logistic','svm','dtree','random_forests','ada_boost']
methods = ['ada_boost']

classifiers = []
for method in methods:
    for i in np.arange(0.4,0.5,0.1):
        clfObj = CH.buildBinClassifier(data,allAttribs,1-i,80,method)
        clfObj.runClf()
        classifiers.append(clfObj)



In [ ]:

    
# Writing all the scores into a pandas data-frame and then into a CSV file
printableClfs = []

for clf in classifiers:
    printableClfs.append(dict(literal_eval(clf.__str__())))
    
df = pd.DataFrame(printableClfs)
df = df[['methodName','splitPercent','accScore','precision','recall','f1Score','auc','sqerr']]
df.columns = ['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']
# df.to_csv("../ClassifierResults/extrmClfMetrics_abv_mean.csv",index=False)



In [ ]:

    
# Will take up valuable Plot.ly plots per day. Limited to 50 plots per day.
# changes to file name important
iFrameBlock = []
for i in np.arange(0.4,0.5,0.1):
    df1 = df[(df['Train-Test Split']==1-i)]
    df1.index = df1['Classifier']
    df1 = df1[['Accuracy','Precision','Recall','F1 score','AUC','Squared Error']].transpose()
    df1.iplot(kind='bar',filename=str('Train-Test_Split_Ratio_abv_mean %f' %i),title=str('Train-Test Split Ratio: %f' %i))
    # iFrameBlock.append(fig.embed_code)

# with open("../ClassifierResults/performanceComparisonsparse.html","w") as perf:
#     perf.write(HT.h1("Performance Comparisons of Classifiers with non_sparse Attributes."))
#     for row in iFrameBlock:
#         perf.write(HT.HTML(row))

Calculating weights of features in the classifiers



In [ ]:

    
clfWeights = []
for clf in classifiers:
    clfAttribs = list(clf.test_x.columns)
    if clf.methodName == 'logistic':
        clfAttribWgts = list(clf.clfObj.coef_[0])
    elif clf.methodName == 'dtree' or clf.methodName == 'random_forests':
        clfAttribWgts = list(clf.clfObj.feature_importances_)
    else:
        continue
        
        
    attribWgt = {clfAttribs[i] : clfAttribWgts[i] for i in range(len(clfAttribs))}
    attribWgt['Method'] = clf.methodName
    attribWgt['Split_Percent'] = clf.splitPercent
        
    clfWeights.append(attribWgt)



In [ ]:

    
clfDf = pd.DataFrame(clfWeights)



In [ ]:

    
indDF = clfDf[(clfDf['Method']=='logistic')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/LogisiticWeights.csv")

indDF = clfDf[(clfDf['Method']=='dtree')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/DecisionTreeWeights.csv")

indDF = clfDf[(clfDf['Method']=='random_forests')]
indDF.index = indDF['Split_Percent']
indDF.drop('Method',1,inplace=True)  
indDF.transpose().to_csv("../ClassifierResults/RandomForestsWeights.csv")



In [ ]:

    
logisticDf = clfDf[(clfDf['Method']=='logistic')]
del logisticDf['Method']
del logisticDf['Split_Percent']
dtreeDf = clfDf[(clfDf['Method']=='dtree')]
del dtreeDf['Method']
del dtreeDf['Split_Percent']
randomForestDf = clfDf[(clfDf['Method']=='random_forests')]
del randomForestDf['Method']
del randomForestDf['Split_Percent']



In [ ]:

    
logisticDf = logisticDf.transpose()
logisticDf.reset_index(inplace=True)
logisticDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_logistic = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    logisticDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = logisticDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_logistic.append(df)
    
concatdf_logisitc = pd.concat([dfs_logistic[0],dfs_logistic[1],dfs_logistic[2],dfs_logistic[3],dfs_logistic[4],dfs_logistic[5],dfs_logistic[6],dfs_logistic[7],dfs_logistic[8]],axis=1)
concatdf_logisitc.to_csv("../ClassifierResults/Top15_Weights_Logisitic.csv")



In [ ]:

    
dtreeDf = dtreeDf.transpose()
dtreeDf.reset_index(inplace=True)
dtreeDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_tree = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    dtreeDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = dtreeDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_tree.append(df)
    
concatdf_dtree = pd.concat([dfs_tree[0],dfs_tree[1],dfs_tree[2],dfs_tree[3],dfs_tree[4],dfs_tree[5],dfs_tree[6],dfs_tree[7],dfs_tree[8]],axis=1)
concatdf_dtree.to_csv("../ClassifierResults/Top15_Weights_Dtree.csv")



In [ ]:

    
randomForestDf = randomForestDf.transpose()
randomForestDf.reset_index(inplace=True)
randomForestDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_rndf = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    randomForestDf.sort_values(by=prcnt,inplace=True,ascending=False)
    df = randomForestDf[['Feature',prcnt]].head(15)
    df.index = np.arange(1,16,1)
    
    dfs_rndf.append(df)
    
concatdf_rndf = pd.concat([dfs_rndf[0],dfs_rndf[1],dfs_rndf[2],dfs_rndf[3],dfs_rndf[4],dfs_rndf[5],dfs_rndf[6],dfs_rndf[7],dfs_rndf[8]],axis=1)
concatdf_rndf.to_csv("../ClassifierResults/Top15_Weights_Rndf.csv")



In [ ]:

    
attribs = [list(dfs_logistic[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)



In [ ]:

    
attribs = [list(dfs_tree[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)



In [ ]:

    
attribs = [list(dfs_rndf[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)



In [ ]:

    
attribs = [list(dfs_logistic[i]['Feature']) for i in range(0,9)]
attribs += [list(dfs_tree[i]['Feature']) for i in range(0,9)]
attribs += [list(dfs_rndf[i]['Feature']) for i in range(0,9)]
attribs = [attrib for listAttrib in attribs for attrib in listAttrib]
pd.DataFrame(Counter(attribs),index=['Frequency']).transpose().sort_values(by=['Frequency'],ascending=False)



In [ ]:

    
logisticDf.sort_values(by='10%',inplace=True,ascending=False)
fig = {
    'data' : [
        {'x' : logisticDf.Feature.head(15),'y' : logisticDf['10%'].head(15), 'mode' : 'markers', 'name' : '10%'}
    ]
}
iplot(fig)



In [ ]:

    
obj1.precision



In [ ]:

    
classifiers[0].preds



In [ ]:

    
data= CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")  
methods = ['dummy','bayesian','logistic','svm','dtree','random_forests','ada_boost']
kwargsDict = {'dummy' : {'strategy' : 'most_frequent'},
            'bayesian' : {'fit_prior' : True},
            'logistic' : {'penalty' : 'l2'},
            'svm' : {'kernel' : 'rbf','probability' : True},
            'dtree' : {'criterion' : 'entropy'},
            'random_forests' : {'n_estimators' : 10 },
            'ada_boost' : {'n_estimators' : 50 }}



In [ ]:

    
allAttribs = CH.genAllAttribs("../FinalResults/ImgShrRnkListWithTags.csv",'non_sparse',"../data/infoGainsExpt2.csv")
clfObj = CH.buildBinClassifier(data,allAttribs,1-0.5,80,'dtree',kwargsDict['dtree'])
clfObj.runClf()



In [ ]:

    
clfObj.precision,clfObj.recall,clfObj.methodName



In [ ]:

    
fpr,tpr,_ = clfObj.roccurve
rocCurve = {}
for i in range(len(fpr)):
    rocCurve[fpr[i]] = tpr[i]
    
pd.DataFrame(rocCurve,index=['tpr']).transpose().iplot()



In [ ]: