In [2]:
import json
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn look-and-feel applied to the figures drawn in this notebook.
sns.set_style("white")
sns.set_context("poster")
# Current seaborn palette; later cells index into it (colours[0], colours[1],
# colours[2]) to colour their bars.
colours = sns.color_palette()
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
In [3]:
# Numeric question-type codes (as compared against in the response-loading
# cell) mapped to a readable name for each question format.
question_types = {
    1: 'ranking',
    2: 'select5',
    3: 'pointwise_score'
}
# Read the question definitions inside a context manager so the file handle is
# closed as soon as the text has been read (the original left it open).
with open('../web-selection-form/data/questions.json') as questions_file:
    json_data = questions_file.read()
# Parsed question definitions, indexed below as
# question_mapping['questionN']['answers'][answer_id].
question_mapping = json.loads(json_data)
In [4]:
# Sanity check: print the text of the first and last answer option of question 1.
for answer_id in ('1', '10'):
    print(question_mapping['question1']['answers'][answer_id])
In [5]:
# Column layout of the response table: for every question, one column per
# answer id (1..10) followed by the question type and the time spent on it;
# then three respondent-level columns.
# NOTE: the key 'qustion_type' (sic) is used verbatim by every later cell.
columns = []
for question in ('question1', 'question2', 'question3'):
    columns.extend((question, answer_id) for answer_id in range(1, 11))
    columns.extend([(question, 'qustion_type'), (question, 'end_time')])
columns.extend([('feedback', ''), ('start_time', ''), ('end_time', '')])
columns = pd.MultiIndex.from_tuples(columns)

# Empty frame; one row per respondent is added by the loading cell.
data_df = pd.DataFrame(columns=columns)
data_df.index.name = 'respondent'
data_df
Out[5]:
In [6]:
data_path = './../responses/'
# Fill data_df with one row per response file (all three questions).
#
# Changes from the original cell:
#   * each file is read inside a `with` block so its handle is closed;
#   * DataFrame.set_value (removed in pandas 1.0) is replaced by the `.at`
#     scalar setter, which also enlarges the frame for a new row label;
#   * the removed `.ix` indexer is replaced by `.loc` for the boolean mask.
#
# NOTE(review): row labels are positions in os.listdir(), whose order is not
# guaranteed, yet specific labels are dropped by number at the end of this
# cell -- confirm those labels identify the intended respondents.
for index, response_file in enumerate(os.listdir(data_path)):
    if 'second_stage' in response_file:
        continue
    with open(os.path.join(data_path, response_file)) as handle:
        data = json.loads(handle.read())

    data_df.at[index, ('feedback', '')] = data['qualitative_feedback']
    # Timestamps are divided by 1000 before conversion (stored in ms, presumably).
    start_time = datetime.datetime.fromtimestamp(data['time_start']/1000.0)
    end_time = datetime.datetime.fromtimestamp(data['time_end']/1000.0)
    data_df.at[index, ('start_time', '')] = start_time
    data_df.at[index, ('end_time', '')] = end_time

    # Sorted list of [start, q?, q?, q?] timestamps: the time spent on a
    # question is its timestamp minus the one immediately before it.
    question_times = [datetime.datetime.fromtimestamp(data['question%i_time'%i]/1000.0) for i in range(1,4)]
    question_times = list(np.sort(question_times + [start_time]))

    for question_number in range(1,4):
        question_number_string = 'question%i'%question_number
        question_type = 'question%i_type'%question_number
        question_time = datetime.datetime.fromtimestamp(data['question%i_time'%question_number]/1000.0)
        question_selections = 'question%i_selections'%question_number

        this_q = question_times.index(question_time)
        # If a question timestamp sorts before start_time, this_q is 0 and the
        # subtraction wraps around to the last entry, giving a negative delta.
        time_delta = question_times[this_q] - question_times[this_q-1]
        data_df.at[index, (question_number_string, 'qustion_type')] = data[question_type]
        data_df.at[index, (question_number_string, 'end_time')] = time_delta

        if data[question_type] == 1: # 'ranking question'
            # Selections list answer ids in rank order; store each answer's rank.
            for i in range(1,11):
                answer = data[question_selections][i-1]
                data_df.at[index, (question_number_string, answer)] = i
        elif data[question_type] == 2: # 'selecting question'
            # 10 for a selected answer, 0 otherwise.
            for i in range(1,11):
                answer = 10 if i in data[question_selections] else 0
                data_df.at[index, (question_number_string, i)] = answer
        else: # 'number question'
            # Entries 1..10 are read (entry 0 is skipped); presumably the
            # stored scores are 1-based -- TODO confirm against the form.
            answers = [int(data[question_selections][i]) for i in range(1,11)]
            total = float(np.sum(answers))
            # Normalise so each respondent's scores sum to 1.
            for i in range(1,11):
                data_df.at[index, (question_number_string, i)] = answers[i-1]/total

data_df = data_df.drop_duplicates()
# Drop hand-picked respondent rows by label.
data_df = data_df.loc[~data_df.index.isin([7,17,28,29])]
In [7]:
# Display the populated response table (one row per retained respondent).
data_df
Out[7]:
In [8]:
# Question 1: one bar chart per question type, each showing the per-answer
# column sums over the respondents who saw the question in that format.
question = 'question1'
print('Question 1')
print(question_mapping[question]['question_description'])

fig, axes = plt.subplots(1, 3, figsize=(15, 3))
answer_columns = list(range(1, 11))
for type_code, ax in enumerate(axes, start=1):
    responses = data_df[question]
    of_this_type = responses[responses['qustion_type'] == type_code]
    agg_num = of_this_type[answer_columns].sum()
    ax.bar(agg_num.index, agg_num.values)
    ax.set_title(question_types[type_code])
In [9]:
# Question 2: cumulative per-answer score, one panel per question type.
fig, axes = plt.subplots(1,3, figsize=(15,3))
print('Question 2')
question = 'question2'
print(question_mapping[question]['question_description'])

# Question-type code shown in each panel (left to right) and its title.
panel_types = [1, 3, 2]
panel_titles = {1: 'Ranking', 2: 'Plurality', 3: 'Cardinal Score'}
answer_columns = [1,2,3,4,5,6,7,8,9,10]

for i, ax in enumerate(axes):
    q_type = panel_types[i]
    # Fix: filter on q_type. The original filtered on i + 1 while titling the
    # panel from q_type, so the middle and right panels showed the data of the
    # other question type.
    agg_num = data_df[question][data_df[question]['qustion_type'] == q_type][answer_columns].sum()
    q_num = agg_num.index
    data = agg_num.values
    ax.bar(q_num, data)
    ax.set_title(panel_titles[q_type])
    if i == 0:
        ax.set_ylabel('cumulative score')
    ax.set_xlabel('point id')
    ax.set_xticks(np.arange(1,11)+0.4)
    ax.set_xticklabels(np.arange(1,11), fontdict={'size':14})
In [10]:
# Question 3: cumulative per-answer score, one panel per question type
# (left: type 1, middle: type 3, right: type 2).
question = 'question3'
print('Question 3')
print(question_mapping[question]['question_description'])

fig, axes = plt.subplots(1, 3, figsize=(15, 3))
titles = {1: 'Ranking', 2: 'Plurality'}
for i, ax in enumerate(axes):
    q_type = {0: 1, 2: 2}.get(i, 3)
    responses = data_df[question]
    matching = responses[responses['qustion_type'] == q_type]
    agg_num = matching[list(range(1, 11))].sum().sort_index()
    ax.bar(agg_num.index, agg_num.values)
    ax.set_title(titles.get(q_type, 'Cardinal Score'))
    if i == 0:
        ax.set_ylabel('cumulative score')
    ax.set_xlabel('point id')
    ax.set_xticks(np.arange(1, 11) + 0.4)
    ax.set_xticklabels(np.arange(1, 11), fontdict={'size': 14})
fig.tight_layout()
In [11]:
# Print the text of three hand-picked answers per topic, with a blank line
# between topics.
# The original used a bare `print` as the separator, which is a no-op
# expression on Python 3; print('') emits the blank line on Python 2 and 3.
# NOTE(review): question3 looks up answer '6' twice -- possibly a copy-paste
# slip; kept as in the original.
highlighted_answers = [
    ('question1', ['6', '9', '8']),
    ('question2', ['3', '5', '1']),
    ('question3', ['6', '6', '10']),
]
for position, (topic, answer_ids) in enumerate(highlighted_answers):
    if position > 0:
        print('')
    for answer_id in answer_ids:
        print(question_mapping[topic]['answers'][answer_id])
View the qualitative feedback:
In [12]:
# Print every respondent's free-text feedback, each followed by a rule.
# Iterating the Series directly yields its values; the original went through
# Series.iteritems() (removed in pandas 2.0) and never used the index.
for f in data_df['feedback']:
    print(f)
    print('------------------------------------------------------------------------')
In [13]:
# Display the question1 columns for the respondents whose question1 had
# type 1 (the 'ranking' format).
data_df['question1'][data_df['question1']['qustion_type'] == 1]
Out[13]:
In [16]:
# For every (question, type in {1, 3}) pair, collect each answer's scores
# across respondents and scatter the per-answer mean against the per-answer
# standard deviation.
ans0 = []  # per-answer means, one length-10 array per (question, type) pair
ans1 = []  # per-answer standard deviations, same layout
# Kept in this cell because a later cell uses `stats` (stats.norm.sf).
import scipy.stats as stats
for question in ['question1','question2','question3']:
    for q_type in [1,3]:
        # scores[j] holds the values given to the j-th answer column.
        scores = [[] for i in range(10)]
        for i,row in data_df[question][data_df[question]['qustion_type'] == q_type].iterrows():
            # First ten entries are the answer columns. `.iloc` replaces the
            # removed `.ix`, which sliced positionally on this mixed-type index.
            row = row.iloc[0:10]
            # Rescale type-3 scores so the respondent's top answer is 10.
            row = (row/row.max()) * 10 if q_type == 3 else row
            # Iterating the Series yields its values in column order
            # (replaces the removed Series.iteritems()).
            for j, k in enumerate(row):
                scores[j].append(k)
        # The original also computed stats.mode(...) and a sorted(...) copy
        # whose results were never used; both were dropped.
        ans0.append(np.mean(scores, axis=1))
        ans1.append(np.std(scores, axis=1))
plt.scatter(ans0, ans1)
plt.xlabel('score mean')
plt.ylabel('score std')
Out[16]:
In [17]:
# Display the per-answer mean (row 0) and standard deviation (row 1) of
# `scores` as left by the previous cell, i.e. for the last (question, type)
# pair it processed.
np.array([np.mean(scores, axis=1), np.std(scores, axis=1)])
Out[17]:
In [18]:
def set_regret(profile, S, T):
    '''
    Minimax Regret Set (naive method)

    Regret of offering the candidate set S when the set T could have been
    offered, summed over the rankings in `profile`. A ranking contributes
    1 / p, where p is the 1-based position of its first candidate from T,
    whenever that position is strictly earlier than the position of its first
    candidate from S. This is the formula the exhaustive-search cell of this
    notebook evaluates inline.

    profile : iterable of rankings (sequences of candidate ids, best first)
    S, T    : collections of candidate ids supporting `in` and `len`

    Returns 0 when T is empty, otherwise the total regret as a float.
    A ranking containing no candidate from a set is treated as placing that
    set at position infinity.
    '''
    if len(T) == 0:
        return 0

    def first_position(ranking, members):
        # 1-based position of the first candidate in `ranking` that is in `members`.
        for position, candidate in enumerate(ranking, start=1):
            if candidate in members:
                return position
        return float('inf')

    regret = 0.0
    for ranking in profile:
        position_T = first_position(ranking, T)
        position_S = first_position(ranking, S)
        if position_T < position_S:
            regret += 1.0 / position_T
    return regret
In [19]:
# Two example candidate sets for experimenting with set_regret.
T = {2, 3, 7, 11}
S = {1, 5}
In [20]:
# Scratch check of the empty-set condition that set_regret tests.
len(T) == 0
Out[20]:
In [21]:
# Synthetic preference profile: ten independent random rankings of the
# candidates 1..10 (unseeded, so it differs on every run).
candidate_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
profile = []
for _ in range(10):
    profile.append(list(np.random.permutation(candidate_ids)))
# first_indices_T = np.array([[i for i,elem in enumerate(x) if x[i] in T][0] + 1 for x in profile])
# first_indices_S = np.array([[i for i,elem in enumerate(x) if x[i] in S][0] + 1 for x in profile])
# regret = np.sum((first_indices_T < first_indices_S)*(1./first_indices_T));
# regret
In [19]:
import itertools

# Size of the candidate sets explored by the search cell.
k = 3
# Lazy iterator over every ordered k-tuple of the candidates 1..10. It is
# single-use: once consumed, this cell must be re-run to iterate again.
permutations1 = itertools.permutations(range(1, 11), k)
In [20]:
# Exhaustive minimax-regret search: for every candidate tuple S taken from
# permutations1, find the worst-case regret against any rival size-k set T,
# and keep the S with the smallest worst case.
#
# Results: output_regret (the minimax value) and permutations (the winning S).
# If permutations1 has already been consumed they stay at inf / None.
output_regret = np.inf
permutations = None

def _first_positions(members):
    """1-based position, in each ranking of `profile`, of the first candidate in `members`."""
    return np.array([[i for i, elem in enumerate(x) if elem in members][0] + 1 for x in profile])

# The regret uses T only through membership tests, so every ordering of the
# same rival set gives the same value: visiting each set once (combinations)
# leaves the maximum unchanged. The rival positions do not depend on S either,
# so they are computed once here instead of once per (S, T) pair.
rival_positions = [_first_positions(T)
                   for T in itertools.combinations([1,2,3,4,5,6,7,8,9,10], k)]

for S in permutations1:
    # Hoisted: independent of T.
    first_indices_S = _first_positions(S)
    max_reg = 0
    for first_indices_T in rival_positions:
        # A ranking contributes 1/position_T when T's first candidate appears
        # strictly before S's first candidate.
        regret = np.sum((first_indices_T < first_indices_S)*(1./first_indices_T))
        if regret > max_reg:
            max_reg = regret
    if max_reg < output_regret:
        output_regret = max_reg
        permutations = S
In [20]:
# Display the smallest worst-case regret found by the search cell.
output_regret
In [21]:
# Display the candidate tuple S that achieved output_regret (None if the
# search loop never ran).
permutations
In [ ]:
In [22]:
# Split the responses by question type. Each list holds three DataFrames
# (question1, question2, question3) restricted to the respondents who saw that
# question in the given format: 1 -> rank, 2 -> select, 3 -> number.
frames_by_type = {}
for q_type in (1, 2, 3):
    frames_by_type[q_type] = [
        data_df[name][data_df[name]['qustion_type'] == q_type]
        for name in ('question1', 'question2', 'question3')
    ]
question_rank = frames_by_type[1]
question_select = frames_by_type[2]
question_number = frames_by_type[3]
In [23]:
# Placeholders kept from the original cell; nothing visible here fills them.
rank_std = []
select_std = []
number_std = []

def _completion_seconds(df):
    """Return the completion times, in whole seconds, of the rows of `df`
    whose end_time lies strictly between 0 and 20 minutes.

    Replaces three copy-pasted loops and avoids Series.iteritems(), which was
    removed in pandas 2.0 (iterating the Series yields the same values).
    """
    lower = datetime.timedelta(minutes=0)
    upper = datetime.timedelta(minutes=20)
    acceptable_times = df[((df.end_time < upper) & (df.end_time > lower))]
    return [t.seconds for t in acceptable_times.end_time]

# One list of times per question (question1..question3) for each format.
rank = [_completion_seconds(df) for df in question_rank]
select = [_completion_seconds(df) for df in question_select]
number = [_completion_seconds(df) for df in question_number]
In [24]:
# Grouped bar chart of mean completion time: one group per question type
# (x tick labels), one coloured bar per question (legend), with std error bars.
plt.figure(figsize=(5,4))
indexes = np.array([1,2,3])
# Rows follow the tick-label order (Ranking, Cardinal Score, Plurality), i.e.
# rank, number, select; columns are question1..question3.
# Fixes relative to the original:
#   * `std` listed its rows as rank/select/number, so the error bars of two
#     types were swapped relative to `answers`;
#   * the bars were drawn from answers[q, :] / std[q, :], i.e. one row (a
#     single type across the three questions) under the label 'questionN',
#     which transposes the chart relative to its legend and tick labels.
type_order = [rank, number, select]
answers = np.array([[np.mean(r) for r in times] for times in type_order])
std = np.array([[np.std(r) for r in times] for times in type_order])
plt.bar(indexes, answers[:,0], yerr=std[:,0], width=0.08, linewidth=0, color=[colours[0], colours[0], colours[0]], label='question1', ecolor='black', error_kw={'linewidth': 1.5}, alpha=0.8)
plt.bar(indexes+0.08, answers[:,1], yerr=std[:,1], width=0.08, color=[colours[1], colours[1], colours[1]], label='question2', ecolor='black', error_kw={'linewidth': 1.5}, alpha=0.8)
plt.bar(indexes+0.16, answers[:,2], yerr=std[:,2], width=0.08, color=[colours[2], colours[2], colours[2]], label='question3', ecolor='black', error_kw={'linewidth': 1.5}, alpha=0.8)
plt.ylabel('time to complete (s)')
plt.legend(loc='best')
plt.xticks([1.2,2.2,3.2], ['Ranking', 'Cardinal Score', 'Plurality'])
plt.show()
In [25]:
# Box plots of completion time: nine boxes (three questions for each of rank,
# select, number), with one tick under the middle box of each group of three.
all_times = rank + select + number
group_ticks = [2, 5, 8]
group_names = ['rank', 'select', 'number']
plt.boxplot(all_times)
plt.xticks(group_ticks, group_names)
plt.show()
In [26]:
# For each respondent, record in which order (fastest first) the three
# questions, and the three question types, were completed.
question_times = []  # argsort of [q1, q2, q3] completion seconds
type_times = []      # argsort of completion seconds indexed by question type
types = []           # [q1 type, q2 type, q3 type]
times = []           # [q1, q2, q3] completion seconds
for index, row in data_df.iterrows():
    q1_time = row.question1.end_time.seconds
    q2_time = row.question2.end_time.seconds
    q3_time = row.question3.end_time.seconds
    # if q1_time < 60*20 and q2_time < 60*20 and q2_time < 60*20:
    # Fix: the original tested q2_time twice and never q3_time.
    # NOTE(review): timedelta.seconds is never negative (a negative delta has
    # negative .days instead), so this only excludes zero-second entries --
    # confirm whether negative deltas should also be filtered.
    if q1_time > 0 and q2_time > 0 and q3_time > 0:
        # Slot t-1 holds the seconds spent on the question of type t.
        types_ = [0,0,0]
        types_[row.question1.qustion_type - 1] = q1_time
        types_[row.question2.qustion_type - 1] = q2_time
        types_[row.question3.qustion_type - 1] = q3_time
        question_times.append(np.argsort([q1_time, q2_time, q3_time]))
        type_times.append(np.argsort(types_))
        times.append([q1_time, q2_time, q3_time])
        types.append([row.question1.qustion_type, row.question2.qustion_type, row.question3.qustion_type])
question_times = np.array(question_times)
type_times = np.array(type_times)
In [27]:
# Side-by-side histograms of the argsort arrays built in the previous cell:
# left for the per-question ordering, right for the per-type ordering.
# NOTE(review): each histogram series is one column of an argsort result (a
# position in the fastest-to-slowest order), yet the left legend uses
# question-type names and the right legend uses what look like topic names --
# confirm the legend labels are the intended ones.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (ax1, question_times, {'label': ['rank', 'select' , 'number']},
     ['Topic1', 'Topic2', 'Topic3'], 'best'),
    (ax2, type_times, {'label': ['admissions', 'interview' , 'gaming'], 'alpha': 0.8},
     ['Ranking', 'select', 'number'], 'right'),
]
for panel, orderings, hist_kwargs, tick_labels, legend_loc in panels:
    panel.hist(orderings, **hist_kwargs)
    panel.set_xticks([0, 1, 2])
    panel.set_xticklabels(tick_labels)
    panel.legend(loc=legend_loc)
plt.show()
In [28]:
# For each question, total the type-2 (select) scores per answer and print the
# highest-scoring answers together with their text.
for q in range(1,4):
    question_key = 'question{}'.format(q)
    df = data_df[question_key][data_df[question_key].qustion_type == 2][[1,2,3,4,5,6,7,8,9,10]].sum()
    # Ascending sort without mutating in place.
    df = df.sort_values()
    print(df)
    # index[6:] keeps the four highest totals of the ten answers.
    # NOTE(review): the select format is named 'select5' -- confirm whether
    # five answers (index[5:]) were intended.
    selections = list(df.index[6:])
    print('Question{}, Type: Plurality'.format(q))
    print(selections)
    for i in selections:
        print('"' +question_mapping[question_key]['answers']['{}'.format(i)] + '",')
    # Blank separator; the original bare `print` is a no-op on Python 3.
    print('')
In [29]:
# Hand-coded sentiment of the free-text feedback: one row per coded comment,
# one column per method (column j is reported as method j + 1);
# -1 = dislike, 0 = neutral, anything else = like.
qualitative_feedback = np.array([
    [-1, 0, 0],
    [-1, 1, 1],
    [-1, 1, 0],
    [ 0, 0, 1],
    [ 0, 1, 0],
    [ 0, 1,-1],
    [-1, 0, 1],
    [ 0, 1,-1],
    [ 1, 0, 0],
    [-1, 1, 0],
    [ 1, 1, 0],
    [-1, 1, 1],
    [-1, 0, 1],
    [ 1, 1,-1],
    [ 1, 0, 0],
    [ 0, 1, 0],
    [-1, 0, 0]
])

# np.nonzero walks the matrix row by row, so each list receives the 1-based
# method numbers in the same order as a nested row/column loop would.
is_dislike = qualitative_feedback == -1
is_neutral = qualitative_feedback == 0
is_like = ~(is_dislike | is_neutral)

dislike = (np.nonzero(is_dislike)[1] + 1).tolist()
neutral = (np.nonzero(is_neutral)[1] + 1).tolist()
like = (np.nonzero(is_like)[1] + 1).tolist()
In [30]:
# Histogram of the coded feedback: for each method, how many comments were
# negative, neutral and positive.
fig, ax = plt.subplots(1,1, figsize=(5,4))
# Fix: the datasets are passed in the same order as their labels. The original
# passed [dislike, like, neutral] with labels ['Dislike', 'Neutral', 'Like'],
# which swapped the 'Neutral' and 'Like' legend entries.
ax.hist([dislike, neutral, like], label=['Dislike', 'Neutral', 'Like'], color=[colours[0], colours[1], colours[2]], alpha=0.8)
# ax.set_title('Qualitative Feedback', fontdict={'size': 22})
ax.set_xticks([1.1,2.1,2.9])
ax.set_xticklabels(['Ranking', 'Cardinal Score', 'Plurality'], fontdict={'size': 16})
ax.legend(loc='center right', bbox_to_anchor=(0.55, 0.5), fontsize=16)
plt.show()
In [31]:
# Display the response table again for reference.
data_df
Out[31]:
In [34]:
# Monte-Carlo null distribution of "distances" between answer ids.
#
# Each simulation draws n_respondents uniformly random rankings of the answer
# ids 1..10. For every ranking and every window of three consecutive positions
# starting at 0, 1, 2, 5, 6 or 7, the absolute difference between each of the
# three pairs of ids in the window is recorded. `result` holds one sequence of
# distances per simulation, ordered ranking -> window -> pair.
#
# Changes from the original cell:
#   * the unused with-replacement draw at the top of the loop and the unused
#     `res` pair-count dictionary are gone;
#   * the pure-Python loops (millions of iterations) are vectorised;
#   * a locally seeded generator makes the simulation reproducible without
#     touching numpy's global random state.
n_simulations = 10000
n_respondents = 37  # presumably the number of retained respondents -- TODO confirm
n_options = 10

rng = np.random.RandomState(0)
# Arg-sorting i.i.d. uniform draws gives a uniformly random permutation per row.
rankings = np.argsort(rng.rand(n_simulations, n_respondents, n_options), axis=-1) + 1

windows = np.concatenate([np.arange(0,3), np.arange(5,8)])
# Offsets, inside a window, of the pairs (0, 1), (0, 2), (1, 2).
pair_first, pair_second = np.array(list(itertools.combinations(range(3), 2))).T
left = windows[:, None] + pair_first     # shape (6 windows, 3 pairs)
right = windows[:, None] + pair_second

# Shape: (simulation, ranking, window, pair).
distances = np.abs(rankings[..., left] - rankings[..., right])
result = list(distances.reshape(n_simulations, -1))
In [35]:
import collections

# For every simulation, count how often each distance 1..9 occurred.
# The original listed Counter(...).iteritems() values as-is, which (a) exists
# only on Python 2, (b) depends on dictionary iteration order, and (c) drops
# distances that never occurred -- so a column was not guaranteed to refer to
# the same distance in every row. Indexing by the explicit distance fixes all
# three; a Counter returns 0 for a missing key.
distance_values = range(1, 10)
counts = []
for simulated_distances in result:
    tally = collections.Counter(simulated_distances)
    counts.append([tally[d] for d in distance_values])
In [36]:
# Mean and standard deviation, across simulations, of the count in each
# column of `counts`.
counts_array = np.array(counts)
means = counts_array.mean(axis=0)
stds = counts_array.std(axis=0)
In [57]:
import itertools

# Observed distribution of answer-id distances per topic, overlaid with the
# simulated expectation (means / stds from the Monte-Carlo cells).
count_of_ones = []  # per topic: number of observed pairs at distance 1
fig, axes = plt.subplots(1,3,figsize=(12,4))
# Start positions of the three-wide windows taken from each sorted row.
windows = np.concatenate([np.arange(0,3), np.arange(5,8)])
for k, (ax, question) in enumerate(zip(axes, ['question1','question2','question3'])):
    dist = []
    for ix, group in data_df[question].groupby('qustion_type'):
        gp = group[[1,2,3,4,5,6,7,8,9,10]]
        for i, row in gp.iterrows():
            # Answer ids ordered by ascending stored value. Hoisted: the
            # original re-sorted the row for every window. (Its unused `res`
            # pair-count dictionary was dropped as well.)
            ordered_ids = row.sort_values().index
            for window in windows:
                top3 = ordered_ids[window:window+3]
                for perm in itertools.combinations(top3, 2):
                    dist.append(np.abs(perm[0] - perm[1]))
    count_of_ones.append(np.sum(np.array(dist) == 1))
    ax.hist(dist, bins=9, label='Observed Distance')
    ax.set_title('Topic {}'.format(k+1))
    ax.set_xlabel('distance')
    if k == 0:
        ax.set_ylabel('frequency')
    ax.bar(np.arange(1,10),means, yerr=stds, color=colours[1], alpha=0.5, width=1, label='Expected Distance')
# One legend, attached to the last panel and anchored to its right.
ax.legend(loc='center right', bbox_to_anchor=(2, 0.5), fontsize=16)
fig.tight_layout()
plt.show()
In [58]:
# Convert the per-simulation distance sequences into an array so the next
# cell can compare each row against 1 element-wise.
result = np.array(result)
In [59]:
# Number of distance-1 pairs in each simulation.
ones = [np.sum(r == 1) for r in result]
In [60]:
# Histogram of the simulated distance-1 counts.
plt.hist(ones)
plt.show()
In [61]:
# Mean and standard deviation of the simulated distance-1 counts, used below
# to standardise the observed counts.
# Note: this rebinds `std`, which the completion-time bar-chart cell also uses.
mean = np.mean(ones)
std = np.std(ones)
In [62]:
# Display the simulated mean number of distance-1 pairs.
mean
Out[62]:
In [67]:
# Standardise each topic's observed distance-1 count against the simulation.
observed_ones = np.asarray(count_of_ones)
z_scores = (observed_ones - mean) / std
In [68]:
# Display the observed number of distance-1 pairs for each of the three topics.
count_of_ones
Out[68]:
In [69]:
# Upper-tail probability of a standard normal at |z| for each topic (the
# survival function gives a one-sided p-value).
# `stats` is scipy.stats, imported in the mean/std scatter cell above.
p_values = stats.norm.sf(np.abs(z_scores))
p_values
Out[69]:
In [70]:
# Literal value echoed back by the cell; presumably one of the p-values copied
# from the output above -- it is not recomputed when the notebook is re-run.
2.70081974e-03
Out[70]:
In [ ]: