In [1]:
# CH and FS are presumably project-local helper modules (their source is not
# visible from this notebook) -- TODO confirm.
import ClassiferHelperAPI as CH
import pandas as pd
import sys
import FeatureSelectionAPI as FS
import importlib
# Reload FS so that edits made to the already-imported FeatureSelectionAPI
# module are picked up without restarting the kernel.
importlib.reload(FS)
from numpy import mean
import csv
In [21]:
# Build the attribute header used by the rest of the notebook.
# Per the original author's note, categorical attributes are turned into
# discrete binary outputs, e.g. SPECIES: Zebra becomes (Zebra: 1, Giraffe: 0, ...).
hasSparse = False
data = CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

# The dense attributes are always used ('tags' is deliberately left out of this
# list); the sparse ones are added only when hasSparse is switched on.
ftrList = ['SPECIES', 'SEX', 'AGE', 'QUALITY', 'VIEW_POINT']
if hasSparse:
    ftrList = ftrList + ['INDIVIDUAL_NAME', 'CONTRIBUTOR', 'tags']

allAttribs = CH.genAttribsHead(data, ftrList)
In [25]:
# Replace the feature list chosen in the earlier cell: this run uses only the
# three attributes that the hasSparse branch would have added, and rebuilds
# allAttribs from them.
ftrList = [
    'INDIVIDUAL_NAME',
    'CONTRIBUTOR',
    'tags',
]
allAttribs = CH.genAttribsHead(data, ftrList)
In [26]:
# Build the attribute dictionary for the 'Train' split.
# Author's note: the binaryClf attribute in createDataFlDict will be True here.
gidAttribDict = CH.createDataFlDict(
    data,
    allAttribs,
    80,  # NOTE(review): meaning not visible from this notebook -- confirm in ClassiferHelperAPI
    'Train',
)
In [27]:
# from_dict puts the dictionary keys on the columns, so transpose to get one
# row per key; then keep only the attribute columns, with TARGET last.
df = (
    pd.DataFrame
    .from_dict(gidAttribDict)
    .T
    [allAttribs + ["TARGET"]]
)

# Preview the first five rows.
df.head()
Out[27]:
In [24]:
# Pair every column of df (TARGET itself included) with its information gain
# measured against the TARGET column.
infoGains = [
    (col, FS.infoGain(df[col], df["TARGET"]))
    for col in df.columns
]
In [28]:
# Add the information gain of every column of the current df to the running
# infoGains list, rank the features, and save the ranking as CSV.
for col in df.columns:
    infoGains.append((col, FS.infoGain(df[col], df.TARGET)))

# De-duplicate by column name (a later entry replaces an earlier one with the
# same name). infoGains may already hold these columns from the comprehension
# cell, or from a previous run of this cell, and repeated rows would otherwise
# pile up in the output.
infoGains = list(dict(infoGains).items())

# Remove TARGET by name. The previous `infoGains[2:]` dropped the two
# top-ranked entries by position, which is only correct when exactly two TARGET
# entries are present and nothing ties with them; on a re-run it would discard
# a real feature instead.
infoGains = [(name, gain) for name, gain in infoGains if name != "TARGET"]

# Highest information gain first.
infoGains = sorted(infoGains, key=lambda x: x[1], reverse=True)

# newline="" is what the csv module requires of files it writes to; without it
# extra blank lines can appear between rows on some platforms.
with open("../data/infoGainsExpt2.csv", "w", newline="") as infGainFl:
    csvWrite = csv.writer(infGainFl)
    csvWrite.writerows(infoGains)
In [10]:
# Number of (column, information gain) entries currently held in infoGains.
len(infoGains)
Out[10]:
In [ ]: