notebook.community

Edit and run



In [1]:

    
import json
from os import listdir
from os.path import isfile, join
import csv



In [2]:

    
mypath = "jsons"
json_files = [f for f in listdir(mypath) if (isfile(join(mypath, f)) and ".json" in f)]

# change this based on the project
pybossa_url = "http://crowd.globalfishingwatch.org/project/id_fishing_vessels_v2/task/"



In [3]:

    
sourcedir = 'input_files/'
filename = '20160304_20160317_HighConfidenceFishingVessels.csv'
model_scores = {}
with open(sourcedir + filename,'rU') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        model_scores[row['mmsi']] = row
        '''
        row['mmsi']
        row['label']
        row['label_score']
        row['type_score']
        '''



In [4]:

    
users = ["" for i in range(21)]
users[1]='davidkroodsma'
users[3]='davidkroodsmathe2nd'
users[4]='alexwilson'
users[5]='chris'
users[2]='bjornbergman'
users[6]='enriquetuya'
users[7]='kristinaboerder'
users[8]='vaiduke2'
users[9]='katepepler'
users[10]='stephanielewis'
users[11]='AlexCerra'
users[12]='juliecharbonneau'
users[13]='ninagalle'
users[14]='sidneyblack-rotchin'
users[15]='daivdtest'
users[16]='cailinburmaster'
users[17]='elizabethnagel'
users[18]='isabelfleisher'
users[19]='ciarawillis'
users[20]='clairechristie'
user_count = [0 for i in range (21)]



In [5]:

    
jsons = []
for j in json_files:
    f = open("jsons/" + j)
    contents = f.read()
    jsons.append(json.loads(contents))



In [6]:

    
jsons[0][0]['task_id']









    Out[6]:





4108



In [7]:

    
results = []
for j in jsons:
    r = {} # the results for this vessel
    v = {} # counts of the vessel types
    vessel_responses = ['longliner','purse_seine','trawler','reefer','multigear',
    'baddata,','otherfish','not_fishing','muti_gear','other','not_known','multiple_vessles']
    for vr in vessel_responses:
        v[vr] = 0 
    r['responses'] = 0
    r['findyes'] = 0
    r['findno'] = 0
    r['othertext'] = ''
    r['didntsearch'] = 0
    r['websitesfound'] = ''
    r['theusers'] = ''
    responses = 0
    for k in j:
        if len(k['info'])>20 and k['user_id'] != 1: #ignore david's results
            r['responses'] += 1
            user_response = json.loads(k['info'].replace("\n",""))
            r['mmsi'] = user_response["mmsi"]
            vt = user_response['vesselType'].lower()
            if vt ==  vt == 'other fishing ' or vt == "other_fishing" or vt == "other fishing" or vt == " other fishing":
                vt = 'otherfish'
            if vt == 'bad_data' or vt == 'not_enough_data': 
                vt = 'baddata'        
            if vt not in vessel_responses:
                r['othertext'] += vt+","
                vt = "other"
            v[vt]+=1
            if user_response['search_url']: 
                r['websitesfound'] += user_response['search_url']+","
            r['theusers'] += users[k['user_id']] + ","
            user_count[k['user_id']]+=1
    r['url'] = pybossa_url + str(j[0]['task_id'])
    if len(r['theusers'])>1:r['theusers'] = r['theusers'][:-1] # get rid of comma at end
    if len(r['websitesfound'])>1: r['websitesfound'] = r['websitesfound'][:-1] # get rid of comma at end
    r['vessel'] = v
    results.append(r)



In [8]:

    
# if there is agreement

outputs = []

for r in results:
    output = {}
    headers = ['mmsi','model_label','student_label','student_model_agreement', 
               'model_score','type','type_score','student_confidence','pybossa_url','othertext']
    m_scores = model_scores[r['mmsi']]
    output['mmsi'] = r['mmsi']
    output['model_label'] = m_scores['label']
    output['model_score'] = round(float(m_scores['label_score']),2)
    output['type'] = m_scores['type']
    output['type_score'] = m_scores['type_score']
    output['pybossa_url'] = r['url']
    output['othertext'] = r['othertext']
    # now figure out what the students thought it was
    v_types = []
    v = r['vessel']
    for vr in vessel_responses:
        if v[vr]: 
            if vr not in v_types: v_types.append(vr)
                
    # three casses:
    # 1) Total agreement and an answer
    # 2) Total agreement but all say "not_known"
    # 3) some think they know what it is, other say "not_known"
    # 4) disagreement
    if len(v_types) == 1 and v_types[0] != 'not_known':
        output['student_confidence'] = 1
        output['student_label'] = v_types[0]
        
    elif len(v_types) == 1 and v_types[0] == 'not_known':
        output['student_confidence'] = 0
        output['student_label'] = v_types[0]
        
        #print r['mmsi'], model_scores[r['mmsi']]['label'], v_types[0], r['url']# sum([v[i] for i in v])
        
    elif len(v_types) == 2 and "not_known" in v_types:
        total = sum([v[i] for i in v])
        not_knowns = sum([v[i] for i in v if i !='not_known'])
        output['student_confidence'] = float(not_knowns)/total
        if v_types[0] == 'not_known': output['student_label'] = v_types[1]
        else: output['student_label'] = v_types[0]
    
    else:
        total = sum([v[i] for i in v])
        not_knowns = sum([v[i] for i in v if i !='not_known'])
        output['student_confidence'] = float(not_knowns)/total*.3 # if there is disagreement, it maxes at .3
        label_string = ''
        for vessel in v_types:
            label_string += vessel+":"+str(v[vessel])+","
        label_string = label_string[:-1] # get rid of comma
        output['student_label'] = label_string
    
    if output['student_confidence'] > .3 and \
    output['student_label'].replace("_"," ") ==  output['model_label'].lower():
            output['student_model_agreement'] = 1
    else:
        output['student_model_agreement'] = 0

    
    outputs.append(output)
    
outputs = sorted(outputs, key=lambda k: -k['student_model_agreement']) 
outputs = sorted(outputs, key=lambda k: -k['student_confidence']) 

outdir = "../"
filename = "FishingVesselsV2_HighConfidenceStudents_20160314_agreement.csv"

with open(outdir+filename,'w') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    f_csv.writerows(outputs)



In [9]:

    
outputs[0]['student_label']#['model_label']

s_labels = []
m_labels = []

for o in outputs:
    if o['student_confidence']>.5:
        sl = o['student_label']
        ml = o['model_label']
        if sl not in s_labels:
            s_labels.append(sl)
        if ml not in m_labels:
            m_labels.append(ml)



In [10]:

    
print s_labels
m_labels









    



['longliner', 'trawler', 'not_fishing', 'reefer', 'purse_seine', 'other']






    Out[10]:





['Longliner', 'Trawler', 'Purse seine']



In [11]:

    
cm = {}
for s in s_labels:
    cm[s] = {}
    for m in m_labels:
        cm[s][m]=0



In [12]:

    
for o in outputs:
    if o['student_confidence']>.5:
        sl = o['student_label']
        ml = o['model_label']
        cm[sl][ml]+=1



In [13]:

    
for m in m_labels:
    print "\t"+m,
print ""
for c in cm:
    print c,
    for m in m_labels:
        print "\t"+str(cm[c][m]),
    print ""









    



	Longliner 	Trawler 	Purse seine 
purse_seine 	0 	0 	2 
reefer 	1 	1 	0 
longliner 	6 	0 	1 
other 	0 	0 	1 
trawler 	8 	13 	0 
not_fishing 	12 	13 	17



In [14]:

    
for o in outputs:
    if o['student_confidence']>.5:
        print o['othertext']









    






























































handlines,baddata,



In [15]:

    
for i in jsons[0]:
    print i['finish_time']









    



2016-03-07T23:49:54.111916
2016-03-08T15:57:25.670631
2016-03-08T19:30:00.423495



In [16]:

    
dates = {}
for jas in jsons:
    for j in jas:
        d = j['finish_time'].split('T')[0]
        if d not in dates:
            dates[d]=0
        dates[d]+=1



In [17]:

    
dates









    Out[17]:





{u'2016-03-07': 3,
 u'2016-03-08': 29,
 u'2016-03-09': 84,
 u'2016-03-10': 42,
 u'2016-03-11': 227,
 u'2016-03-12': 25,
 u'2016-03-13': 5,
 u'2016-03-14': 23,
 u'2016-03-15': 5,
 u'2016-03-18': 29,
 u'2016-03-19': 21,
 u'2016-03-20': 36,
 u'2016-03-21': 6,
 u'2016-03-22': 22,
 u'2016-03-23': 80,
 u'2016-03-25': 3,
 u'2016-03-27': 7,
 u'2016-03-30': 19,
 u'2016-04-01': 18,
 u'2016-04-04': 1}



In [18]:

    
ds = []
for d in dates:
    print d+"\t"+str(dates[d])









    



2016-03-30	19
2016-04-04	1
2016-03-27	7
2016-03-25	3
2016-04-01	18
2016-03-20	36
2016-03-21	6
2016-03-22	22
2016-03-23	80
2016-03-19	21
2016-03-18	29
2016-03-08	29
2016-03-09	84
2016-03-15	5
2016-03-07	3
2016-03-14	23
2016-03-11	227
2016-03-10	42
2016-03-13	5
2016-03-12	25



In [52]:









    Out[52]:





[u'2016-03-07',
 u'2016-03-08',
 u'2016-03-09',
 u'2016-03-10',
 u'2016-03-11',
 u'2016-03-12',
 u'2016-03-13',
 u'2016-03-14',
 u'2016-03-15',
 u'2016-03-18',
 u'2016-03-19',
 u'2016-03-20',
 u'2016-03-21',
 u'2016-03-22',
 u'2016-03-23',
 u'2016-03-25',
 u'2016-03-27',
 u'2016-03-30']



In [ ]: