Contains code for performing feature selection using Information Gain

Author: Sreejith Menon(smenon8@uic.edu)


In [1]:
import ClassiferHelperAPI as CH
import pandas as pd
import sys
import FeatureSelectionAPI as FS
import importlib
importlib.reload(FS)
from numpy import mean
import csv

In [21]:
# Build the attribute header: categorical attributes are converted into
# discrete binary outputs, one per category value.
# Example - SPECIES : Zebra becomes (Zebra: 1, Giraffe: 0 .. )
hasSparse = False
data = CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

# These five features are always used; the three extra ones are only
# appended when hasSparse is switched on.
ftrList = ['SPECIES', 'SEX', 'AGE', 'QUALITY', 'VIEW_POINT']
if hasSparse:
    ftrList += ['INDIVIDUAL_NAME', 'CONTRIBUTOR', 'tags']

allAttribs = CH.genAttribsHead(data, ftrList)

In [25]:
# Replace the earlier feature selection with these three features only and
# regenerate the attribute header for them.
ftrList = [
    'INDIVIDUAL_NAME',
    'CONTRIBUTOR',
    'tags',
]
allAttribs = CH.genAttribsHead(data, ftrList)

In [26]:
# binaryClf attribute in createDataFlDict will be True here
# NOTE(review): the meaning of the literal 80 is not visible in this notebook -
# presumably a split/threshold setting for the 'Train' mode; confirm in ClassiferHelperAPI.
gidAttribDict = CH.createDataFlDict(
    data,
    allAttribs,
    80,
    'Train',
)

In [27]:
# Turn the dictionary into a frame, flip it so the dictionary keys become the
# row index, and order the columns as: all attributes, then TARGET last.
df = (
    pd.DataFrame.from_dict(gidAttribDict)
    .T[allAttribs + ["TARGET"]]
)
df.head()


Out[27]:
IBEIS_PZ_0180 IBEIS_PZ_0468 IBEIS_PZ_0872 IBEIS_PZ_0260 IBEIS_PZ_0897 IBEIS_PZ_0570 IBEIS_PZ_1571 IBEIS_PZ_0105 IBEIS_PZ_0726 IBEIS_PZ_1392 ... tall lone mammal group running grass walking way tree TARGET
10 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
1005 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
1024 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
1041 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0
1045 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0

5 rows × 851 columns


In [24]:
# Pair every column name of df (the TARGET column included) with the value
# FS.infoGain returns for that column against df.TARGET.
infoGains = list(
    map(lambda col: (col, FS.infoGain(df[col], df.TARGET)), df.columns)
)

In [28]:
# Merge the information gain of every column of the current df into the
# results already held in infoGains. Entries are keyed by column name, so
# re-running this cell (or scoring a column that was scored before) updates
# the existing entry instead of appending a duplicate row.
gainByCol = dict(infoGains)
for col in df.columns:
    gainByCol[col] = FS.infoGain(df[col], df.TARGET)

# Remove the label column by name. The previous positional slice ([2:]) only
# gave the right result when exactly two label entries sorted to the top.
# NOTE(review): assumes the two dropped entries were the TARGET rows - confirm.
gainByCol.pop("TARGET", None)

# Highest information gain first.
infoGains = sorted(gainByCol.items(), key=lambda x: x[1], reverse=True)

# The csv module expects files opened with newline="" so that it controls the
# line endings itself (otherwise blank rows can appear on some platforms).
with open("../data/infoGainsExpt2.csv", "w", newline="") as infGainFl:
    csv.writer(infGainFl).writerows(infoGains)

In [10]:
# Number of (column, information gain) entries currently held in infoGains.
len(infoGains)


Out[10]:
874

In [ ]: