Python functions for audit

Verify that the script actually does the right job and that no manual error was introduced.

In [85]:
import csv
import json
import JobsMapResultsFilesToContainerObjs as ImageMap
import importlib
importlib.reload(ImageMap)


Out[85]:
<module 'JobsMapResultsFilesToContainerObjs' from '/Users/sreejithmenon/Google Drive/PythonCode/AnimalPhotoBias/script/JobsMapResultsFilesToContainerObjs.py'>

In [6]:
jsonObj = json.load(open("../data/experiment2_gid_aid_features.json"))
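
The parsing in the next cell assumes a particular layout for experiment2_gid_aid_features.json, inferred from the indexing expression dct[aid][2][0]; the concrete values in this sketch are hypothetical, not taken from the actual file:

In [ ]:
# Hypothetical illustration of the assumed layout (values invented for illustration):
example = {
    "1041": [{"512": ["ftr0", "ftr1", ["zebra_plains"]]},
             {"513": ["ftr0", "ftr1", ["giraffe_masai"]]}],
    "1042": None,
}
# With this layout, dct[aid][2][0] extracts the species label of each annotation:
[dct[aid][2][0] for dct in example["1041"] for aid in dct]
# -> ['zebra_plains', 'giraffe_masai']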

In [7]:
gidSpeciesList = []
for gid in jsonObj.keys():
    if jsonObj[gid] is not None:
        gidSpecies = {}
        for dct in jsonObj[gid]:
            for aid in dct.keys():
                # collect the species label of every annotation (aid) in this image (gid)
                gidSpecies[gid] = gidSpecies.get(gid, []) + [dct[aid][2][0]]
        gidSpeciesList.append(gidSpecies)

In [12]:
# Flag images whose annotation list contains more than one species
for dct in gidSpeciesList:
    for speciesLst in dct.values():
        firstEle = speciesLst[0]
        for ele in speciesLst:
            if ele != firstEle:
                print(dct.keys())


dict_keys(['1041'])
dict_keys(['6962'])
dict_keys(['6962'])
dict_keys(['978'])
dict_keys(['978'])
dict_keys(['978'])
dict_keys(['1332'])
dict_keys(['1332'])
dict_keys(['5470'])
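
The duplicated GIDs above are just an artifact of printing once per mismatching element. An equivalent, more concise check (a sketch, assuming gidSpeciesList as built above) would be:

In [ ]:
# GIDs whose annotation species are not all identical
mixedGids = {gid for dct in gidSpeciesList
                 for gid, speciesLst in dct.items()
                 if len(set(speciesLst)) > 1}
print(mixedGids)   # expected: {'1041', '6962', '978', '1332', '5470'}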

In [69]:
def extractImageFeaturesFromMap(gidAidMapFl, aidFtrMapFl, feature):
    aidFeatureDict = ImageMap.genAidFeatureDictDict(aidFtrMapFl)

    gidAidDict = ImageMap.genAidGidDictFromMap(gidAidMapFl)

    gidFeatureLst = []
    for gid in gidAidDict:
        if gidAidDict[gid] is not None:   # was gidAidJson, which is undefined inside the function
            gidFtr = {}
            for aid in gidAidDict[gid]:
                gidFtr[gid] = gidFtr.get(gid, []) + [aidFeatureDict[str(aid)][feature]]
            gidFeatureLst.append(gidFtr)  # append inside the if, so gids mapped to None are skipped

    return gidFeatureLst

In [95]:
aidFeatureDict = ImageMap.genAidFeatureDictDict("../data/experiment2_aid_features.json")
gidAidJson = ImageMap.genAidGidDictFromMap("../data/experiment2_gid_aid_map.json")

featuresPerImg = ImageMap.extractImageFeaturesFromMap("../data/experiment2_gid_aid_map.json","../data/experiment2_aid_features.json","SPECIES")
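
As a cross-check (a sketch; it assumes featuresPerImg is a dict keyed by the same GID strings and holding a list of species per image, which the cells below rely on), the module output can be compared against the manually built gidSpeciesList:

In [ ]:
# Compare the per-image species lists reconstructed from the combined features file
# with the ones produced from the gid_aid map + aid features files
manual = {gid: lst for dct in gidSpeciesList for gid, lst in dct.items()}
for gid, speciesLst in manual.items():
    assert sorted(featuresPerImg[gid]) == sorted(speciesLst), gid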

In [96]:
shareCountLogic = {}
for gid in featuresPerImg.keys():
    numInds = len(featuresPerImg[gid])        # number of annotated individuals in the image
    isHomogeneous = True
    firstEle = featuresPerImg[gid][0]
    for species in featuresPerImg[gid]:
        if species != firstEle:
            isHomogeneous = False
    if isHomogeneous:
        countFor = firstEle                   # all annotations share one species; count towards it
    else:
        countFor = None                       # mixed-species image
    shareCountLogic[gid] = [numInds, isHomogeneous, countFor]

In [97]:
# Flatten shareCountLogic into tuples and keep only the non-homogeneous (mixed-species) images
l = [(gid, numInds, isHomogeneous, countFor)
     for gid, (numInds, isHomogeneous, countFor) in shareCountLogic.items()]
list(filter(lambda x: not x[2], l))


Out[97]:
[('1041', 2, False, None),
 ('1332', 6, False, None),
 ('6962', 4, False, None),
 ('978', 4, False, None),
 ('5470', 2, False, None)]
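
As a quick summary (a sketch using the standard-library Counter; this is only a tally of single-species images per species, not the project's share-counting rule):

In [ ]:
from collections import Counter

# Number of homogeneous (single-species) images per species
speciesImageCounts = Counter(countFor
                             for numInds, isHomogeneous, countFor in shareCountLogic.values()
                             if isHomogeneous)
speciesImageCounts.most_common()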

In [21]:
import PopulationEstimatorFromClf as PE
import pandas as pd
import json
import DeriveFinalResultSet as DRS
import DataStructsHelperAPI as DS

In [4]:
clfTypes = ['bayesian','logistic','svm','dtree','random_forests','ada_boost']
attribTypes = ['sparse','non_sparse','non_zero','abv_mean']

# For this audit run, restrict to a single classifier / attribute-selection combination
clfTypes = ['bayesian']
attribTypes = ['sparse']
for clf in clfTypes:
    for attrib in attribTypes:
        print("Starting to run %s classifier on test data\nAttribute Selection Method : %s" %(clf,attrib))
        clfObj,predResults = PE.trainTestClf("../FinalResults/ImgShrRnkListWithTags.csv",
                             "../data/full_gid_aid_ftr_agg.csv",
                             clf,
                             attrib,
                             "../data/infoGainsExpt2.csv")


Starting to run bayesian classifier on test data
Attribute Selection Method : sparse
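
Before converting to a DataFrame, a quick sanity check on predResults (assumed, from the DataFrame constructed in the next cell, to be a dict mapping GID to a 0/1 share prediction):

In [ ]:
# predResults is assumed to map GID -> predicted share label (0 or 1)
assert all(v in (0, 1) for v in predResults.values())
len(predResults)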

In [12]:
dfPredRes = pd.DataFrame(predResults,index=['share']).transpose().reset_index()
dfPredRes.columns = ['GID','share']
dfPredRes.head()


Out[12]:
    GID  share
0     1      0
1    10      1
2  1000      0
3  1001      0
4  1002      0

In [24]:
gidMarkRecapSet = genNidMarkRecapDict("../data/imgs_exif_data_full.json",
                                      "../data/full_gid_aid_map.json",
                                      "../data/full_aid_features.json",
                                      "../FinalResults/rankListImages_expt2.csv",
                                      days,
                                      filterBySpecies='giraffe_masai',
                                      shareData=None)

In [27]:
dfGidDays = pd.DataFrame(gidMarkRecapSet,index=['day']).transpose().reset_index()
dfGidDays.columns = ['GID','day']
dfGidDays.head()


Out[27]:
    GID  day
0     1    1
1    10    1
2   100    1
3  1000    1
4  1001    1

In [29]:
dfPredRes.merge(dfGidDays, on='GID').to_csv("/tmp/audit.dump.csv")
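
Since merge defaults to an inner join, one further audit step (a sketch) is to check how many GIDs survive the join and whether any appear on only one side:

In [ ]:
merged = dfPredRes.merge(dfGidDays, on='GID')
print(len(dfPredRes), len(dfGidDays), len(merged))
# GIDs present in only one of the two frames are dropped by the inner join
print(set(dfPredRes['GID']) - set(dfGidDays['GID']))
print(set(dfGidDays['GID']) - set(dfPredRes['GID']))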

In [ ]:
# Build the fixed-k results for k = 1..10. 'k=k' binds the current k into the
# lambda's default argument, so the callable keeps its value even if it is
# stored and invoked after the comprehension finishes.
fixedK = {k : kSharesPerContributor(prediction_probabs, inExifFl, inGidAidMapFl,
                                    inAidFtrFl, lambda k=k: k)
          for k in range(1, 11)}