In [126]:
import json
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
%matplotlib inline
In [70]:
# Numeric question-type codes found in the response files, mapped to a readable name.
question_types = {
    1: 'ranking',
    2: 'select5',
    3: 'pointwise_score',
}

# Load the question descriptions and answer texts. The context manager closes
# the file handle as soon as the JSON has been parsed.
with open('../web-selection-form/data/questions.json') as questions_file:
    question_mapping = json.load(questions_file)
In [71]:
# Look up the text behind answer ids '1' and '10' of question 1.
for answer_key in ('1', '10'):
    print(question_mapping['question1']['answers'][answer_key])
In [72]:
# Two-level column index of the response table. Every question contributes ten
# answer columns (1..10), its question type and the time spent on it; three
# respondent-level fields follow at the end.
columns = []
for question_key in ('question1', 'question2', 'question3'):
    columns.extend((question_key, answer_id) for answer_id in range(1, 11))
    # 'qustion_type' (sic) is the label every later cell looks up; keep the spelling.
    columns.append((question_key, 'qustion_type'))
    columns.append((question_key, 'end_time'))
columns.extend([('feedback', ''), ('start_time', ''), ('end_time', '')])
columns = pd.MultiIndex.from_tuples(columns)

# Empty frame; rows are added one respondent at a time by the loading cell.
data_df = pd.DataFrame(columns=columns)
data_df.index.name = 'respondant'
data_df
Out[72]:
In [82]:
data_path = './../responses/'

# Fill data_df with one row per response file found in data_path.
# NOTE(review): os.listdir() returns names in arbitrary order, while the rows
# dropped at the end of this cell are identified by the hard-coded labels
# 7, 17, 28 and 29 handed out by enumerate() -- confirm the listing order is
# the one those labels were chosen for.
# NOTE: DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0,
# so this cell needs an older pandas.
for index, file_name in enumerate(os.listdir(data_path)):
    # `with` closes the handle once the JSON is parsed.
    with open(os.path.join(data_path, file_name)) as response_file:
        data = json.load(response_file)

    data_df.set_value(index=index, col=('feedback', ''), value=data['qualitative_feedback'])

    # Timestamps are divided by 1000 before conversion (presumably stored in
    # milliseconds -- TODO confirm).
    start_time = datetime.datetime.fromtimestamp(data['time_start'] / 1000.0)
    end_time = datetime.datetime.fromtimestamp(data['time_end'] / 1000.0)
    data_df.set_value(index=index, col=('start_time', ''), value=start_time)
    data_df.set_value(index=index, col=('end_time', ''), value=end_time)

    # Completion time of each question, parsed once and reused below.
    question_end_times = {
        number: datetime.datetime.fromtimestamp(data['question%i_time' % number] / 1000.0)
        for number in range(1, 4)
    }
    # Chronological timeline starting with start_time; the time spent on a
    # question is the gap to the entry that precedes its completion time.
    timeline = sorted(list(question_end_times.values()) + [start_time])

    for question_number in range(1, 4):
        question_number_string = 'question%i' % question_number
        question_type = data['question%i_type' % question_number]
        selections = data['question%i_selections' % question_number]

        this_q = timeline.index(question_end_times[question_number])
        time_delta = timeline[this_q] - timeline[this_q - 1]
        data_df.set_value(index=index, col=(question_number_string, 'qustion_type'), value=question_type)
        data_df.set_value(index=index, col=(question_number_string, 'end_time'), value=time_delta)

        if question_type == 1:
            # Ranking question: store the position (1..10) given to each answer.
            for position in range(1, 11):
                answer = selections[position - 1]
                data_df.set_value(index=index, col=(question_number_string, answer), value=position)
        elif question_type == 2:
            # Selecting question: 10 for a chosen answer, 0 otherwise.
            for answer_id in range(1, 11):
                answer = 10 if answer_id in selections else 0
                data_df.set_value(index=index, col=(question_number_string, answer_id), value=answer)
        else:
            # Number question: store each score as a share of the respondent's
            # total, which is computed once. A total of 0 raises ZeroDivisionError.
            scores = [int(selections[answer_id]) for answer_id in range(1, 11)]
            total = float(sum(scores))
            for answer_id in range(1, 11):
                data_df.set_value(index=index, col=(question_number_string, answer_id),
                                  value=scores[answer_id - 1] / total)

data_df = data_df.drop_duplicates()
data_df = data_df.drop([7,17,28,29], axis=0)
In [83]:
# Question-1 columns of the rows whose question-1 type is 1 (the ranking question).
data_df['question1'].loc[data_df['question1']['qustion_type'] == 1]
Out[83]:
In [7]:
# Display the whole response table.
data_df
Out[7]:
In [8]:
# Question 1: summed stored value per answer option, one panel per question type.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
print('Question 1')
print(question_mapping['question1']['question_description'])
answer_columns = list(range(1, 11))
for type_code, ax in enumerate(axes, start=1):
    rows_of_type = data_df['question1']['qustion_type'] == type_code
    totals = data_df['question1'].loc[rows_of_type, answer_columns].sum()
    ax.bar(totals.index, totals.values)
    ax.set_title(question_types[type_code])
In [9]:
# Question 2: summed stored value per answer option, one panel per question type.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
print('Question 2')
print(question_mapping['question2']['question_description'])
answer_columns = list(range(1, 11))
for type_code, ax in enumerate(axes, start=1):
    # Filter on question 2's own type column: the stored values (rank position,
    # 0/10 selection, normalised score) only share a scale within one type, and
    # the panel title names that type.
    rows_of_type = data_df['question2']['qustion_type'] == type_code
    totals = data_df['question2'].loc[rows_of_type, answer_columns].sum()
    ax.bar(totals.index, totals.values)
    ax.set_title(question_types[type_code])
In [10]:
# Question 3: summed stored value per answer option, one panel per question type.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
print('Question 3')
print(question_mapping['question3']['question_description'])
answer_columns = list(range(1, 11))
for type_code, ax in enumerate(axes, start=1):
    # Filter on question 3's own type column: the stored values (rank position,
    # 0/10 selection, normalised score) only share a scale within one type, and
    # the panel title names that type.
    rows_of_type = data_df['question3']['qustion_type'] == type_code
    totals = data_df['question3'].loc[rows_of_type, answer_columns].sum()
    ax.bar(totals.index, totals.values)
    ax.set_title(question_types[type_code])
In [11]:
# Print the text of three answer ids per question, with a blank line between
# questions. print('') emits that blank line on Python 2 and Python 3 alike,
# whereas a bare `print` only does so as a Python 2 statement.
# NOTE(review): id '6' appears twice for question 3 -- confirm whether a
# different answer id was intended for one of the two entries.
selected_answers = [
    ('question1', ['6', '9', '8']),
    ('question2', ['3', '5', '1']),
    ('question3', ['6', '6', '10']),
]
for group_index, (question_key, answer_keys) in enumerate(selected_answers):
    if group_index > 0:
        print('')
    for answer_key in answer_keys:
        print(question_mapping[question_key]['answers'][answer_key])
View the qualitative feedback:
In [16]:
# Print every respondent's free-text feedback, each followed by a blank line.
# print('') emits the blank line on Python 2 and Python 3 alike; the row label
# is not needed, hence `_`.
for _, feedback_text in data_df['feedback'].iteritems():
    print(feedback_text)
    print('')
function [regret] = set_regret(profile,S,T) % Regret of set S for set T
% Sums 1/position over the rows of profile in which the first member of T
% sits in an earlier column than the first member of S.
% An empty T gives zero regret.
if isempty(T) regret = 0; return end
% find() lists matches column by column, so the first hit that
% unique(...,'first') keeps per row is that row's lowest matching column.
% NOTE(review): assumes every row of profile contains a member of S and of T;
% otherwise the two index vectors differ in length -- confirm.
[I,J] = find(ismember(profile,S)); [~,ind] = unique(I,'first'); first_indices_S = J(ind);
[I,J] = find(ismember(profile,T)); [~,ind] = unique(I,'first'); first_indices_T = J(ind);
% Only rows where T is reached strictly before S contribute.
regret = sum((first_indices_T < first_indices_S).*(1./first_indices_T)); end
In [ ]:
def _first_position(ranking, members):
    """Return the 1-based position of the first entry of `ranking` found in `members`."""
    for position, item in enumerate(ranking, 1):
        if item in members:
            return position
    raise ValueError('ranking %r contains no element of %r' % (list(ranking), list(members)))


def set_regret(profile, S, T):
    '''
    Minimax Regret Set (naive method): regret of the set S for the set T.

    Every ranking in which the first member of T appears strictly before the
    first member of S contributes 1 / (position of that member of T).

    Parameters
    ----------
    profile : iterable of rankings, each a sequence of items in preference order.
    S, T : collections of items supporting `in` and `len`.

    Returns
    -------
    0 when T is empty, otherwise the summed regret as a float.

    Raises
    ------
    ValueError
        If T is non-empty and some ranking contains no member of S or of T.
    '''
    if len(T) == 0:
        return 0
    regret = 0.0
    for ranking in profile:
        first_T = _first_position(ranking, T)
        first_S = _first_position(ranking, S)
        if first_T < first_S:
            regret += 1.0 / first_T
    return regret
In [46]:
# Two example item sets for trying out the regret code.
T = {2, 3, 7, 11}
S = {1, 5}
In [47]:
# Same emptiness test that set_regret applies to T.
len(T) == 0
Out[47]:
In [85]:
# Synthetic profile: ten independent random orderings of the items 1..10.
# No seed is set, so the orderings differ on every run.
profile = [
    list(np.random.permutation(list(range(1, 11))))
    for _ in range(10)
]
In [100]:
import itertools

# Size of the candidate sets, and a one-shot iterator over every ordered
# k-tuple drawn from the items 1..10.
k = 3
permutations1 = itertools.permutations(range(1, 11), k)
In [101]:
# Minimax-regret search: find the k-item set S whose largest regret against any
# competing k-item set T is smallest.
#
# The regret only depends on which items are in S and T, never on their order,
# so each set is visited once (itertools.combinations) rather than once per
# ordering. Combinations come out in the same lexicographic order as the
# sorted orderings, so `output_regret` and the tuple stored in `permutations`
# are the same as when every ordered tuple is enumerated. The candidates are
# built inside this cell so that re-running it never hits an exhausted iterator.
candidate_sets = list(itertools.combinations(range(1, 11), k))


def _first_hit(ranking, members):
    """Return the 1-based position of the first entry of `ranking` that is in `members`."""
    return [pos for pos, item in enumerate(ranking, 1) if item in members][0]


# First-hit position of every candidate set in every ranking of the profile,
# computed once per set instead of once per (S, T) pair.
first_indices = {
    members: np.array([_first_hit(ranking, members) for ranking in profile])
    for members in candidate_sets
}

output_regret = np.inf
permutations = None  # best S found so far (name kept for the cells below)
for S in candidate_sets:
    first_indices_S = first_indices[S]
    max_reg = 0
    for T in candidate_sets:
        first_indices_T = first_indices[T]
        # Rankings that reach T strictly before S contribute 1 / position of T.
        regret = np.sum((first_indices_T < first_indices_S) * (1. / first_indices_T))
        if regret > max_reg:
            max_reg = regret
    if max_reg < output_regret:
        output_regret = max_reg
        permutations = S
In [102]:
# Smallest worst-case regret found by the search cell.
output_regret
Out[102]:
In [103]:
# Item tuple S for which output_regret was recorded.
permutations
Out[103]:
In [15]:
import pickle
In [84]:
# Answers of the respondents who got question 1 as a ranking task (type 1):
# one row per respondent, column j holding the position stored for answer j.
# NOTE(review): rate_to_rank pickles lists of answer ids in preference order,
# which is a different layout -- confirm the consumer of these files expects both.
q1_sort_profile = (
    data_df['question1']
    .loc[data_df['question1']['qustion_type'] == 1, list(range(1, 11))]
    .values
)
with open('./q1_sort_profile.p', 'wb') as file_:
    pickle.dump(q1_sort_profile, file_)
In [85]:
# Answers of the respondents who got question 2 as a ranking task (type 1):
# one row per respondent, column j holding the position stored for answer j.
# NOTE(review): rate_to_rank pickles lists of answer ids in preference order,
# which is a different layout -- confirm the consumer of these files expects both.
q2_sort_profile = (
    data_df['question2']
    .loc[data_df['question2']['qustion_type'] == 1, list(range(1, 11))]
    .values
)
with open('./q2_sort_profile.p', 'wb') as file_:
    pickle.dump(q2_sort_profile, file_)
In [86]:
# Answers of the respondents who got question 3 as a ranking task (type 1):
# one row per respondent, column j holding the position stored for answer j.
# NOTE(review): rate_to_rank pickles lists of answer ids in preference order,
# which is a different layout -- confirm the consumer of these files expects both.
q3_sort_profile = (
    data_df['question3']
    .loc[data_df['question3']['qustion_type'] == 1, list(range(1, 11))]
    .values
)
with open('./q3_sort_profile.p', 'wb') as file_:
    pickle.dump(q3_sort_profile, file_)
In [87]:
def rate_to_rank(question_num):
    """Convert the pointwise scores of one question into rankings and pickle them.

    Takes the rows of the global data_df whose type for this question is 3,
    breaks ties between equal scores at random, and writes an array with one
    row per respondent -- the answer ids (1..10) from highest to lowest score --
    to './q<question_num>_rate_profile.p'. Returns None.
    """
    answers = data_df['question{}'.format(question_num)]
    rate_profile = answers[answers['qustion_type'] == 3][[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values

    rate_list = []
    for scores in rate_profile:
        unique, counts = np.unique(scores, return_counts=True)
        for value, count in zip(unique, counts):
            if count <= 1:
                continue
            # Nudge randomly chosen tied entries upwards by a growing, tiny
            # amount until only one entry still carries the original value.
            cntr = 1
            while np.sum(scores == value) > 1:
                tied = np.where(scores == value)[0]
                scores[np.random.choice(tied)] += cntr * .0000001
                cntr += 1
        # Pair each score with its 0-based column and order from best to worst.
        best_first = sorted(zip(scores, range(10)))[::-1]
        rate_list.append([column + 1 for _, column in best_first])

    rate_list = np.array(rate_list)
    with open('./q{}_rate_profile.p'.format(question_num), 'wb') as file_:
        pickle.dump(rate_list, file_)
In [88]:
# Write the rate profile of each of the three questions.
for question_num in (1, 2, 3):
    rate_to_rank(question_num)
In [95]:
# For every question and profile type, load the result ids from
# './q<question>_<type>_results.p' (no cell in this notebook writes these
# files) and print them together with the matching answer texts.
# NOTE: pickle.load can execute arbitrary code -- only load results files that
# come from a trusted source.
for question_num in range(1, 4):
    for typ in ['rate', 'sort']:
        with open('./q{}_{}_results.p'.format(question_num, typ), 'rb') as file_:
            results = pickle.load(file_)
        print("Question {}, Type: {}".format(question_num, typ))
        # Function-call form prints the same text on Python 2 and parses on Python 3.
        print(results)
        for answer_id in results:
            print(question_mapping['question{}'.format(question_num)]['answers']['{}'.format(answer_id)])
        print('\n')
In [120]:
def _positions_from_scores(scores):
    """Return, for each answer column, its 1-based place when sorted by descending score.

    Ties between equal scores are broken at random by nudging randomly chosen
    tied entries upwards by a growing, tiny amount. Works on a float copy, so
    the caller's data is left untouched.
    """
    scores = np.array(scores, dtype=float)
    unique, counts = np.unique(scores, return_counts=True)
    for value, count in zip(unique, counts):
        if count <= 1:
            continue
        cntr = 1
        while np.sum(scores == value) > 1:
            tied = np.where(scores == value)[0]
            scores[np.random.choice(tied)] += cntr * .0000001
            cntr += 1
    best_first = sorted(zip(scores, range(len(scores))), reverse=True)
    positions = [0] * len(scores)
    for place, (_, column) in enumerate(best_first, 1):
        positions[column] = place
    return positions


def calc_spearman(question_num):
    """Spearman correlation between every pair of pointwise-score respondents.

    Takes the rows of the global data_df whose type for this question is 3,
    converts each row into a position vector (entry j = place of answer j+1,
    best score first) and correlates every pair of respondents.

    Position vectors are the layout avg_scc_q correlates for ranking-type
    answers. Lists of answer ids in preference order must not be passed to
    spearmanr instead: that would correlate the arbitrary id numbers, so the
    coefficient would change if the answers were renumbered.

    Returns
    -------
    list
        One spearmanr result per respondent pair (i < j); element [0] of each
        is the correlation coefficient.
    """
    answers = data_df['question{}'.format(question_num)]
    rate_profile = answers[answers['qustion_type'] == 3][[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values
    rank_vectors = [_positions_from_scores(scores) for scores in rate_profile]

    respondent_count = len(rank_vectors)
    return [
        spearmanr(rank_vectors[i], rank_vectors[j])
        for i in range(respondent_count)
        for j in range(i + 1, respondent_count)
    ]
In [121]:
# Mean pairwise Spearman coefficient of the pointwise-score respondents, per question.
sr_list_1 = calc_spearman(1)
print(np.mean([result[0] for result in sr_list_1]))
sr_list_2 = calc_spearman(2)
print(np.mean([result[0] for result in sr_list_2]))
sr_list_3 = calc_spearman(3)
print(np.mean([result[0] for result in sr_list_3]))
In [127]:
def avg_scc_q(question_num):
    """Print mean pairwise test statistics for the ranking answers of one question.

    Takes the rows of the global data_df whose type for this question is 1 and,
    for every pair of respondents, evaluates spearmanr, wilcoxon, kendalltau
    and mannwhitneyu on their two rows. The mean of element [0] of each test's
    results is then printed, one line per test in that order. Returns None.
    """
    answers = data_df['question{}'.format(question_num)]
    sort_profile = answers[answers['qustion_type'] == 1][[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values

    tests = (spearmanr, wilcoxon, kendalltau, mannwhitneyu)
    statistics = [[] for _ in tests]
    respondent_count = len(sort_profile)
    for i in range(respondent_count):
        for j in range(i + 1, respondent_count):
            # All four tests run on a pair before moving on to the next pair.
            for collected, test in zip(statistics, tests):
                collected.append(test(sort_profile[i], sort_profile[j])[0])

    for collected in statistics:
        print(np.mean(collected))
In [128]:
# Print the mean pairwise statistics for each of the three questions.
for question_num in (1, 2, 3):
    avg_scc_q(question_num)
In [ ]: