notebook.community

Edit and run



In [1]:

    
import json
from os import listdir
from os.path import isfile, join
import csv



In [2]:

    
# Directory that the JSON exports are read from.
mypath = "jsons"
# Keep only regular files whose name ends in ".json" (a substring test would
# also pick up names such as "x.json.bak"). The list is sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the processing order differ between runs/machines.
json_files = sorted(f for f in listdir(mypath)
                    if isfile(join(mypath, f)) and f.endswith(".json"))

# change this based on the project
pybossa_url = "http://crowd.globalfishingwatch.org/project/maptests2/task/"



In [3]:

    
# Username lookup table: the list index is the numeric user_id found in the
# response records, the value is the matching username. Index 0 has no
# username and stays an empty string.
users = [
    "",                      # 0
    'davidkroodsma',         # 1
    'bjornbergman',          # 2
    'davidkroodsmathe2nd',   # 3
    'alexwilson',            # 4
    'chris',                 # 5
    'enriquetuya',           # 6
    'kristinaboerder',       # 7
    'vaiduke2',              # 8
    'katepepler',            # 9
    'stephanielewis',        # 10
    'AlexCerra',             # 11
    'juliecharbonneau',      # 12
    'ninagalle',             # 13
    'sidneyblack-rotchin',   # 14
    'daivdtest',             # 15
    'cailinburmaster',       # 16
    'elizabethnagel',        # 17
    'isabelfleisher',        # 18
    'ciarawillis',           # 19
    'clairechristie',        # 20
]
# Per-user tally, indexed the same way as `users`; every slot starts at zero.
user_count = [0] * len(users)



In [4]:

    
# Parse every selected file; `jsons` ends up holding one parsed document per
# entry of `json_files`, in the same order.
jsons = []
for j in json_files:
    # Build the path from `mypath` (the directory the names were listed from)
    # rather than a second hard-coded "jsons/" prefix, and use a context
    # manager so each file handle is closed as soon as it has been parsed.
    with open(join(mypath, j)) as f:
        jsons.append(json.load(f))



In [36]:

    
# Debugging aid: replay the vessel-type tally for one vessel only and print
# every answer that has to be remapped to a canonical category.
debug_mmsi = '200000055'
vessel_responses = ['longliner','purse_seine','trawler','reefer','multigear',
                    'baddata','otherfish','not_fishing','muti_gear','other','not_known']
# Spellings of "other fishing" that are folded into 'otherfish'.
other_fishing_aliases = ('other fishing ', 'other_fishing', 'other fishing', ' other fishing')

for j in jsons:
    # The vessel is identified by the 'mmsi' value inside the JSON-encoded
    # 'info' string of the last entry of this document.
    last_info = json.loads(j[-1]['info'].replace("\n"," "))
    if last_info['mmsi'] != debug_mmsi:
        continue

    r = {} # the results for this vessel
    v = dict.fromkeys(vessel_responses, 0) # counts of the vessel types
    r['responses'] = 0
    r['findyes'] = 0
    r['findno'] = 0
    r['othertext'] = ''
    r['didntsearch'] = 0
    r['websitesfound'] = ''
    r['theusers'] = ''

    for entry in j:
        # Entries whose 'info' is 20 characters or shorter are skipped, and
        # so is everything submitted by user 1.
        if len(entry['info']) <= 20 or entry['user_id'] == 1: #ignore david's results
            continue
        r['responses'] += 1
        user_response = json.loads(entry['info'].replace("\n",""))
        r['mmsi'] = user_response["mmsi"]
        vt = user_response['vesselType'].lower()
        if vt in other_fishing_aliases:
            vt = 'otherfish'
        if vt == 'bad_data' or vt == 'not_enough_data':
            vt = 'baddata'
            # Single-argument print: same text under Python 2 and Python 3.
            print("yes " + vt)
        if vt not in vessel_responses:
            # Free-text answers are recorded verbatim and counted as 'other'.
            r['othertext'] += vt+","
            print(vt)
            vt = "other"
        v[vt] += 1
        if user_response['search_url']:
            r['websitesfound'] += user_response['search_url']+","
        r['theusers'] += users[entry['user_id']] + ","
        # `user_count` is deliberately left untouched: this cell inspects a
        # single vessel, and incrementing the shared per-user tally here would
        # count these responses a second time when the aggregation over all
        # of `jsons` runs.









    



yes baddata
yes baddata
yes baddata



In [37]:

    
# Build one summary record per document in `jsons`:
#   responses      number of responses that were counted
#   mmsi           taken from the counted responses (absent if none counted)
#   vessel         dict: vessel-type category -> number of votes
#   othertext      comma-terminated free-text answers that matched no category
#   websitesfound  comma-separated non-empty 'search_url' values
#   theusers       comma-separated usernames of the counted responses
#   url            pybossa_url + task_id of the first entry
#   findyes / findno / didntsearch are initialised to 0 and not updated here.
vessel_responses = ['longliner','purse_seine','trawler','reefer','multigear',
                    'baddata','otherfish','not_fishing','muti_gear','other','not_known']
# Answer spellings that are folded into a canonical category.
other_fishing_aliases = ('other fishing ', 'other_fishing', 'other fishing', ' other fishing')
bad_data_aliases = ('bad_data', 'not_enough_data')

# Start the per-user tally from zero so that re-running this cell, or having
# run any earlier cell that also increments it, cannot inflate the counts.
user_count = [0] * len(users)

results = []
for j in jsons:
    v = dict.fromkeys(vessel_responses, 0) # counts of the vessel types
    r = {                                  # the results for this vessel
        'responses': 0,
        'findyes': 0,
        'findno': 0,
        'othertext': '',
        'didntsearch': 0,
    }
    websites = []
    names = []
    for k in j:
        # Entries whose 'info' is 20 characters or shorter are skipped, and
        # so is everything submitted by user 1.
        if len(k['info']) <= 20 or k['user_id'] == 1: #ignore david's results
            continue
        r['responses'] += 1
        user_response = json.loads(k['info'].replace("\n",""))
        r['mmsi'] = user_response["mmsi"]
        vt = user_response['vesselType'].lower()
        if vt in other_fishing_aliases:
            vt = 'otherfish'
        if vt in bad_data_aliases:
            vt = 'baddata'
        if vt not in vessel_responses:
            # Free-text answers are recorded verbatim and counted as 'other'.
            r['othertext'] += vt+","
            vt = "other"
        v[vt] += 1
        if user_response['search_url']:
            websites.append(user_response['search_url'])
        names.append(users[k['user_id']])
        user_count[k['user_id']] += 1
    # Joining the collected pieces yields the comma-separated strings without
    # a trailing comma, including when the only username is the empty string.
    r['websitesfound'] = ",".join(websites)
    r['theusers'] = ",".join(names)
    r['url'] = pybossa_url + str(j[0]['task_id'])
    r['vessel'] = v
    results.append(r)



In [38]:

    
# Reduce each vessel's vote tally to a single student label plus a confidence
# score, then write one CSV row per vessel, most confident first.

# CSV column order. Defined before the loop so that it exists even when
# `results` is empty.
headers = ['mmsi','student_label','student_confidence','pybossa_url']


def _student_label(v):
    """Collapse one vessel's vote counts into ``(label, confidence)``.

    ``v`` maps every entry of ``vessel_responses`` to its number of votes.

    Four cases:
      1) total agreement on a concrete type   -> that type, confidence 1
      2) total agreement on "not_known"       -> "not_known", confidence 0
      3) one concrete type plus "not_known"   -> that type; confidence is the
         share of votes that are not "not_known"
      4) disagreement                         -> "type:count,type:count,..."
         with that same share scaled by 0.3 (so it maxes out at 0.3)

    A vessel without any votes gets ``('', 0)`` instead of dividing by zero.
    """
    # Categories that received at least one vote, in `vessel_responses` order.
    v_types = [vr for vr in vessel_responses if v[vr]]
    if not v_types:
        return '', 0

    if len(v_types) == 1:
        only = v_types[0]
        return only, (0 if only == 'not_known' else 1)

    total = sum(v.values())
    knowns = sum(count for name, count in v.items() if name != 'not_known')
    share_known = float(knowns) / total

    if len(v_types) == 2 and "not_known" in v_types:
        label = v_types[1] if v_types[0] == 'not_known' else v_types[0]
        return label, share_known

    label = ",".join(vessel + ":" + str(v[vessel]) for vessel in v_types)
    return label, share_known * .3 # if there is disagreement, it maxes at .3


outputs = []
for r in results:
    label, confidence = _student_label(r['vessel'])
    outputs.append({
        # 'mmsi' is only present when at least one response was counted.
        'mmsi': r.get('mmsi', ''),
        'student_label': label,
        'student_confidence': confidence,
        'pybossa_url': r['url'],
    })

# Highest confidence first; rows with equal confidence keep their order.
outputs = sorted(outputs, key=lambda row: -row['student_confidence'])

outdir = "../"
filename = "FishingVesselsV1_20160314_agreement.csv"

# NOTE(review): the csv module documentation asks for binary mode on Python 2
# (newline='' on Python 3); plain 'w' is kept so the written file is unchanged.
with open(join(outdir, filename),'w') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    f_csv.writerows(outputs)



In [ ]: