In [2]:
    
import json
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
sns.set_context("poster")
colours = sns.color_palette()
%matplotlib inline
    
    
In [3]:
    
# Readable name for each numeric question-type code stored in the responses.
question_types = {
    1: 'ranking',
    2: 'select5',
    3: 'pointwise_score'
}
# Load the question/answer text. The context manager closes the file handle
# instead of leaving it open until garbage collection.
with open('../web-selection-form/data/questions.json') as questions_file:
    question_mapping = json.load(questions_file)
    
In [4]:
    
# Show the text of the first and the last answer option of question 1
# (answer ids are string keys in the mapping).
for answer_id in ('1', '10'):
    print(question_mapping['question1']['answers'][answer_id])
    
    
In [5]:
    
# Two-level column layout of the response table: for every question one column
# per answer id (1-10) followed by its type and timing columns, then the three
# respondent-level fields whose second level is the empty string.
column_tuples = []
for question_key in ('question1', 'question2', 'question3'):
    for answer_id in range(1, 11):
        column_tuples.append((question_key, answer_id))
    column_tuples.append((question_key, 'qustion_type'))
    column_tuples.append((question_key, 'end_time'))
for respondent_field in ('feedback', 'start_time', 'end_time'):
    column_tuples.append((respondent_field, ''))

columns = pd.MultiIndex.from_tuples(column_tuples)

# Empty frame; rows are added one respondent at a time by the loading cell.
data_df = pd.DataFrame(columns=columns)
data_df.index.name = 'respondent'
data_df
    
    Out[5]:
In [6]:
    
data_path = './../responses/'

# Fill data_df with one row per first-stage response file found in data_path.
# NOTE(review): the row label is the file's position in os.listdir(), whose
# order is not guaranteed, and the exclusion list at the bottom of this cell
# refers to those labels -- confirm the listing order matches the one the list
# was written for before trusting the exclusions.
for index, file in enumerate(os.listdir(data_path)):

    # Second-stage files are skipped, but they still use up an enumerate() slot.
    if 'second_stage' in file:
        continue

    # The context manager closes each response file as soon as it is parsed.
    with open(os.path.join(data_path, file)) as response_file:
        data = json.load(response_file)

    # .at replaces DataFrame.set_value (removed in pandas 1.0); assigning to a
    # row label that does not exist yet adds the row.
    data_df.at[index, ('feedback', '')] = data['qualitative_feedback']

    # Timestamps are divided by 1000 before conversion (presumably epoch
    # milliseconds -- confirm against the web form).
    start_time = datetime.datetime.fromtimestamp(data['time_start']/1000.0)
    end_time = datetime.datetime.fromtimestamp(data['time_end']/1000.0)

    data_df.at[index, ('start_time', '')] = start_time
    data_df.at[index, ('end_time', '')] = end_time

    # Chronologically sorted: the start time plus the three question timestamps.
    question_times = [datetime.datetime.fromtimestamp(data['question%i_time'%i]/1000.0) for i in range(1,4)]
    question_times = list(np.sort(question_times + [start_time]))

    for question_number in range(1,4):
        question_number_string = 'question%i'%question_number
        question_type = 'question%i_type'%question_number
        question_time = datetime.datetime.fromtimestamp(data['question%i_time'%question_number]/1000.0)
        question_selections = 'question%i_selections'%question_number

        # Time spent on a question = gap to the timestamp that precedes it.
        # If this timestamp sorts first (position 0), index -1 wraps around to
        # the latest timestamp and the delta is negative.
        this_q = question_times.index(question_time)
        time_delta = question_times[this_q] - question_times[this_q-1]

        data_df.at[index, (question_number_string, 'qustion_type')] = data[question_type]
        data_df.at[index, (question_number_string, 'end_time')] = time_delta

        if data[question_type] == 1: # 'ranking question'
            # Selections list the answers in ranked order; store each answer's rank.
            for i in range(1,11):
                answer = data[question_selections][i-1]
                data_df.at[index, (question_number_string, answer)] = i
        elif data[question_type] == 2: # 'selecting question'
            # Selected answers score 10, all others 0.
            for i in range(1,11):
                answer = 10 if i in data[question_selections] else 0
                data_df.at[index, (question_number_string, i)] = answer
        else: # 'number question'
            # NOTE(review): selections are read at positions 1..10 here but at
            # 0..9 for ranking questions -- confirm the stored format.
            answers = [int(data[question_selections][i]) for i in range(1,11)]

            # Normalise the scores so they sum to 1; the total is computed once.
            total = float(np.sum(answers))
            for i in range(1,11):
                data_df.at[index, (question_number_string, i)] = answers[i-1]/total

data_df = data_df.drop_duplicates()
# Boolean-mask selection via .loc (.ix was removed in pandas 1.0).
data_df = data_df.loc[~data_df.index.isin([7,17,28,29])]
    
In [7]:
    
data_df
    
    Out[7]:
In [8]:
    
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
question = 'question1'
print('Question 1')
print(question_mapping[question]['question_description'])

# One panel per question type (codes 1-3): the column-wise total of every
# answer over the respondents who got question 1 in that format.
answer_columns = list(range(1, 11))
for type_code, ax in enumerate(axes, start=1):
    responses = data_df[question]
    of_this_type = responses[responses['qustion_type'] == type_code]
    totals = of_this_type[answer_columns].sum()
    ax.bar(totals.index, totals.values)
    ax.set_title(question_types[type_code])
    
    
    
In [9]:
    
fig, axes = plt.subplots(1,3, figsize=(15,3))
print('Question 2')
question = 'question2'
print(question_mapping[question]['question_description'])

# Panels from left to right show question types 1, 3 and 2.
panel_types = [1, 3, 2]
panel_titles = {1: 'Ranking', 2: 'Plurality', 3: 'Cardinal Score'}
answer_columns = [1,2,3,4,5,6,7,8,9,10]

for i, ax in enumerate(axes):

    q_type = panel_types[i]

    # Select the rows by q_type (the same code the title is derived from), not
    # by i + 1: with i + 1 the second and third panels showed the data of the
    # other type under the wrong title.
    responses = data_df[question]
    agg_num = responses[responses['qustion_type'] == q_type][answer_columns].sum()

    q_num = agg_num.index
    data = agg_num.values
    ax.bar(q_num, data)

    ax.set_title(panel_titles[q_type])

    if i == 0:
        ax.set_ylabel('cumulative score')
    ax.set_xlabel('point id')
    ax.set_xticks(np.arange(1,11)+0.4)
    ax.set_xticklabels(np.arange(1,11), fontdict={'size':14})
# Same layout call as the question 3 figure.
fig.tight_layout()
    
    
    
In [10]:
    
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
question = 'question3'
print('Question 3')
print(question_mapping[question]['question_description'])

# Left to right the panels show question types 1, 3 and 2.
titles_by_type = {1: 'Ranking', 2: 'Plurality', 3: 'Cardinal Score'}
answer_columns = list(range(1, 11))
tick_labels = np.arange(1, 11)

for i, (ax, q_type) in enumerate(zip(axes, (1, 3, 2))):
    responses = data_df[question]
    matching = responses[responses['qustion_type'] == q_type]
    # Column-wise total of every answer, ordered by answer id.
    agg_num = matching[answer_columns].sum().sort_index()
    ax.bar(agg_num.index, agg_num.values)

    ax.set_title(titles_by_type[q_type])
    if i == 0:
        # Only the left-most panel carries the shared y label.
        ax.set_ylabel('cumulative score')
    ax.set_xlabel('point id')
    ax.set_xticks(tick_labels + 0.4)
    ax.set_xticklabels(tick_labels, fontdict={'size': 14})

fig.tight_layout()
    
    
    
In [11]:
    
# Print the text of three hand-picked answers per question, with a blank line
# between questions. print('') emits the blank line on both Python 2 and 3; a
# bare `print` is a no-op expression on Python 3.
print(question_mapping['question1']['answers']['6'])
print(question_mapping['question1']['answers']['9'])
print(question_mapping['question1']['answers']['8'])
print('')
print(question_mapping['question2']['answers']['3'])
print(question_mapping['question2']['answers']['5'])
print(question_mapping['question2']['answers']['1'])
print('')
# NOTE(review): answer '6' is printed twice for question3 -- possibly a
# copy-paste slip; confirm which answer id was intended for the second line.
print(question_mapping['question3']['answers']['6'])
print(question_mapping['question3']['answers']['6'])
print(question_mapping['question3']['answers']['10'])
    
    
View the qualitative feedback:
In [12]:
    
# Print every respondent's free-text feedback, separated by a rule.
# .items() replaces .iteritems(), which was removed in pandas 2.0.
for i, f in data_df['feedback'].items():

    print(f)
    print('------------------------------------------------------------------------')
    
    
In [13]:
    
data_df['question1'][data_df['question1']['qustion_type'] == 1]
    
    Out[13]:
In [16]:
    
# Kept in this cell because the p-value cell further down relies on `stats`.
import scipy.stats as stats

# For every question and for the ranking (1) and number (3) formats, collect
# the per-answer mean and standard deviation of the respondents' scores.
ans0 = []  # one array of ten means per (question, type)
ans1 = []  # the matching standard deviations
for question in ['question1','question2','question3']:
    for q_type in [1,3]:
        # scores[j] gathers the values given to the j-th answer column.
        scores = [[] for i in range(10)]
        responses = data_df[question]
        for i,row in responses[responses['qustion_type'] == q_type].iterrows():
            # The first ten entries of a row are the answer columns; .iloc
            # replaces the positional .ix slice (.ix was removed in pandas 1.0).
            row = row.iloc[0:10]
            # Number-type rows are rescaled so that their maximum becomes 10.
            row = (row/row.max()) * 10 if q_type == 3 else row
            for j, value in enumerate(row):
                scores[j].append(value)

        # Mean and std are computed once each (the unused mode/sorted calls
        # of the earlier version are gone).
        ans0.append(np.mean(scores, axis=1))
        ans1.append(np.std(scores, axis=1))
plt.scatter(ans0, ans1)
plt.xlabel('score mean')
plt.ylabel('score std')
    
    Out[16]:
    
In [17]:
    
np.array([np.mean(scores, axis=1), np.std(scores, axis=1)])
    
    Out[17]:
In [18]:
    
def set_regret(profile, S, T):
    '''
    Regret of offering the set S when the set T could have been offered
    (naive method, building block of the minimax regret set search).

    Every ranking in which the best-placed member of T comes strictly before
    the best-placed member of S contributes 1 / (1-based position of that
    member of T); all other rankings contribute nothing.

    profile : iterable of rankings, each a sequence of candidates, best first
    S, T    : containers of candidates (only membership is used)

    Returns 0 when T is empty, otherwise the summed regret as a float. A set
    with no member in a ranking is treated as placed infinitely far down, so
    it never beats the other set in that ranking.
    '''
    if len(T) == 0:
        regret = 0
        return regret

    unplaced = float('inf')
    regret = 0.0
    for ranking in profile:
        first_T = next((pos for pos, item in enumerate(ranking, 1) if item in T), unplaced)
        first_S = next((pos for pos, item in enumerate(ranking, 1) if item in S), unplaced)
        if first_T < first_S:
            regret += 1. / first_T
    return regret
    
In [19]:
    
T = set([2,3,7,11])
S = set([1,5])
    
In [20]:
    
len(T) == 0
    
    Out[20]:
In [21]:
    
# Synthetic preference profile: ten voters, each a uniformly random ranking of
# the candidates 1-10. A dedicated, seeded generator makes the profile (and the
# regret search built on it) identical after every kernel restart.
PROFILE_SEED = 0
profile_rng = np.random.RandomState(PROFILE_SEED)
profile = [list(profile_rng.permutation([1,2,3,4,5,6,7,8,9,10])) for i in range(10)]
# first_indices_T = np.array([[i for i,elem in enumerate(x) if x[i] in T][0] + 1 for x in profile])
# first_indices_S = np.array([[i for i,elem in enumerate(x) if x[i] in S][0] + 1 for x in profile])
# regret = np.sum((first_indices_T < first_indices_S)*(1./first_indices_T));
# regret
    
In [19]:
    
import itertools
k = 3
permutations1 = itertools.permutations([1,2,3,4,5,6,7,8,9,10], k)
    
In [20]:
    
# Exhaustive minimax-regret search: choose the size-k set S whose worst-case
# regret against any alternative size-k set T is smallest.
#
# The regret only tests membership in S and T, so unordered combinations are
# enumerated instead of ordered permutations (k! times fewer pairs, same
# optimum, and the winner is still the lexicographically first optimal set).
# The candidate sets are built in this cell, so it can be re-run without first
# re-creating a one-shot iterator elsewhere.
candidate_sets = list(itertools.combinations([1,2,3,4,5,6,7,8,9,10], k))

# 1-based position of the best-placed member of each candidate set, per voter.
# Computed once per set instead of once per (S, T) pair.
first_indices = {}
for subset in candidate_sets:
    first_indices[subset] = np.array(
        [[i for i, elem in enumerate(x) if elem in subset][0] + 1 for x in profile])

output_regret = np.inf
permutations = None
for S in candidate_sets:

    first_indices_S = first_indices[S]
    max_reg = 0

    for T in candidate_sets:

        first_indices_T = first_indices[T]
        # A voter regrets S whenever T's best member is ranked strictly higher;
        # the regret weight is the reciprocal of that member's position.
        regret = np.sum((first_indices_T < first_indices_S)*(1./first_indices_T))

        if regret > max_reg:
            max_reg = regret

    if max_reg < output_regret:
        output_regret = max_reg
        permutations = S
    
    
In [20]:
    
output_regret
    
    
In [21]:
    
permutations
    
    
In [ ]:
    
    
In [22]:
    
# For each answer format (1 = ranking, 2 = selecting, 3 = number) build a list
# holding, per question, the rows of the respondents who answered that
# question in that format.
frames_by_type = {}
for type_code in (1, 2, 3):
    per_question = []
    for question_key in ('question1', 'question2', 'question3'):
        responses = data_df[question_key]
        per_question.append(responses[responses['qustion_type'] == type_code])
    frames_by_type[type_code] = per_question

question_rank = frames_by_type[1]
question_select = frames_by_type[2]
question_number = frames_by_type[3]
q_type = 3
    
In [23]:
    
def _completion_seconds(frames):
    '''
    Return, for every frame in `frames`, the list of completion times in
    seconds, keeping only rows whose `end_time` lies strictly between
    0 and 20 minutes.
    '''
    lower = datetime.timedelta(minutes=0)
    upper = datetime.timedelta(minutes=20)
    per_frame = []
    for df in frames:
        acceptable_times = df[((df.end_time < upper) & (df.end_time > lower))]
        # Iterating the column yields its values directly, so the pandas
        # iteritems() call (removed in pandas 2.0) is no longer needed.
        per_frame.append([t.seconds for t in acceptable_times.end_time])
    return per_frame


# One list of times per question, for each of the three answer formats.
rank = _completion_seconds(question_rank)
select = _completion_seconds(question_select)
number = _completion_seconds(question_number)

# Never filled in this cell; kept so any later use still finds the names.
rank_std = []
select_std = []
number_std = []
    
In [24]:
    
# Grouped bar chart of the mean completion time (+/- one std): one group per
# answer format on the x axis, one bar per question inside each group.
plt.figure(figsize=(5,4))
indexes = np.array([1,2,3])

# Row order matches the x tick labels below: Ranking, Cardinal Score, Plurality.
# Means and stds are built from the same list so their rows cannot disagree;
# the std rows used to be ordered rank/select/number.
times_by_type = [rank, number, select]
answers = np.array([[np.mean(r) for r in per_question] for per_question in times_by_type])
std = np.array([[np.std(r) for r in per_question] for per_question in times_by_type])

for q in range(3):
    # Column q holds question q+1 across the three formats, which is what a
    # series labelled 'questionN' has to show at the three format positions.
    # Row q would instead be one format across the three questions.
    extra = {'linewidth': 0} if q == 0 else {}
    plt.bar(indexes + 0.08*q, answers[:,q], yerr=std[:,q], width=0.08,
            color=colours[q], label='question%i' % (q + 1), ecolor='black',
            error_kw={'linewidth': 1.5}, alpha=0.8, **extra)

plt.ylabel('time to complete (s)')
plt.legend(loc='best')
plt.xticks([1.2,2.2,3.2], ['Ranking', 'Cardinal Score', 'Plurality'])
plt.show()
    
    
In [25]:
    
# Nine boxes: three questions for each of rank, select and number, in that
# order. Each tick sits under the middle box of its group of three.
box_data = rank + select + number
group_centres = [2, 5, 8]
plt.boxplot(box_data)
plt.xticks(group_centres, ['rank', 'select', 'number'])
plt.show()
    
    
In [26]:
    
# Per respondent, order the three questions and the three answer formats from
# fastest to slowest.
question_times = []  # argsort of the times by question position
type_times = []      # argsort of the times by answer-format code
types = []
times = []
question_keys = ['question1', 'question2', 'question3']
for index, row in data_df.iterrows():

    per_question = [row[key] for key in question_keys]
    # total_seconds() keeps the sign of the delta; .seconds is never negative,
    # so it could not be used to reject negative time deltas.
    q_times = [answer['end_time'].total_seconds() for answer in per_question]
    q_types = [answer['qustion_type'] for answer in per_question]

    # All three questions must have a positive duration (the earlier check
    # tested question 2 twice and never question 3).
    if all(q_time > 0 for q_time in q_times):
        # types_[code - 1] = time spent on the question asked in that format.
        types_ = [0,0,0]
        for q_type, q_time in zip(q_types, q_times):
            types_[int(q_type) - 1] = q_time
        question_times.append(np.argsort(q_times))
        type_times.append(np.argsort(types_))
        times.append(q_times)
        types.append(q_types)
question_times = np.array(question_times)
type_times = np.array(type_times)
    
In [27]:
    
# Each column of question_times / type_times holds, per respondent, WHICH
# question (or answer format) came out fastest, second and slowest (argsort
# output). The x axis therefore is the question / format and every histogram
# series is a speed position, so the legend names the speed positions.
speed_labels = ['fastest', 'middle', 'slowest']
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(12,4))
ax1.hist(question_times, label=speed_labels)
ax1.set_xticks([0,1,2])
ax1.set_xticklabels(['Topic1', 'Topic2', 'Topic3'])
ax1.legend(loc='best')
ax2.hist(type_times, label=speed_labels, alpha=0.8)
ax2.set_xticks([0,1,2])
ax2.set_xticklabels(['Ranking', 'select', 'number'])
ax2.legend(loc='right')
plt.show()
    
    
In [28]:
    
# For every question, total the scores of the respondents who answered it in
# the selecting format (type 2) and list the four answers with the highest totals.
for q in range(1,4):

    question_key = 'question{}'.format(q)
    responses = data_df[question_key]
    df = responses[responses.qustion_type == 2][[1,2,3,4,5,6,7,8,9,10]].sum()
    df = df.sort_values()
    print(df)
    # Ascending sort of ten totals: positions 6-9 are the four largest.
    selections = list(df.index[6:])
    print('Question{}, Type: Plurality'.format(q))
    print(selections)

    for i in selections:
        print('"' +question_mapping[question_key]['answers']['{}'.format(i)] + '",')

    # Blank separator line; a bare `print` does nothing on Python 3.
    print('')
    
    
In [29]:
    
# Hand-coded sentiment per respondent (rows) towards three items (columns):
# -1 is bucketed as dislike, 0 as neutral, anything else as like.
qualitative_feedback = np.array([
    [-1, 0, 0],
    [-1, 1, 1],
    [-1, 1, 0],
    [ 0, 0, 1],
    [ 0, 1, 0],
    [ 0, 1,-1],
    [-1, 0, 1],
    [ 0, 1,-1],
    [ 1, 0, 0],
    [-1, 1, 0],
    [ 1, 1, 0],
    [-1, 1, 1],
    [-1, 0, 1],
    [ 1, 1,-1],
    [ 1, 0, 0],
    [ 0, 1, 0],
    [-1, 0, 0]
])

# Each bucket collects the 1-based column number of every matching rating,
# in row-major order.
dislike, neutral, like = [], [], []
bucket_for_rating = {-1: dislike, 0: neutral}
for doc in qualitative_feedback:
    for column_number, rating in enumerate(doc, start=1):
        bucket_for_rating.get(int(rating), like).append(column_number)
    
In [30]:
    
fig, ax = plt.subplots(1,1, figsize=(5,4))
# The datasets are passed in the same order as their labels and colours. The
# earlier [dislike, like, neutral] order drew the "like" counts under the
# 'Neutral' label and the "neutral" counts under 'Like'.
ax.hist([dislike,neutral,like] , label=['Dislike', 'Neutral', 'Like'], color=[colours[0], colours[1], colours[2]], alpha=0.8)
# ax.set_title('Qualitative Feedback', fontdict={'size': 22})
ax.set_xticks([1.1,2.1,2.9])
ax.set_xticklabels(['Ranking', 'Cardinal Score', 'Plurality'], fontdict={'size': 16})
ax.legend(loc='center right', bbox_to_anchor=(0.55, 0.5), fontsize=16)
plt.show()
    
    
In [31]:
    
data_df
    
    Out[31]:
In [34]:
    
# Monte-Carlo baseline: distances between answer ids that end up close together
# when every respondent orders the ten answers uniformly at random.
N_SIMULATIONS = 10000
N_RESPONDENTS = 37  # NOTE(review): presumably the number of respondents analysed -- confirm
WINDOW_STARTS = [0, 1, 2, 5, 6, 7]  # start positions of the three-wide windows
SIMULATION_SEED = 0

# Seeded, dedicated generator so the baseline is reproducible.
simulation_rng = np.random.RandomState(SIMULATION_SEED)

# Uniform random permutations of 1..10: the argsort of i.i.d. uniform draws.
# Shape: (simulation, respondent, position).
orderings = simulation_rng.rand(N_SIMULATIONS, N_RESPONDENTS, 10).argsort(axis=2) + 1

# For every window, the absolute difference of each of the three pairs of ids
# inside the window, computed for all simulations and respondents at once.
pair_distances = []
for window in WINDOW_STARTS:
    for first, second in itertools.combinations(range(window, window + 3), 2):
        pair_distances.append(np.abs(orderings[:, :, first] - orderings[:, :, second]))

# (simulation, respondent, pair) flattened per simulation, i.e. the order a
# respondent -> window -> pair nested loop would produce.
distances = np.dstack(pair_distances).reshape(N_SIMULATIONS, -1)

# One sequence of distances per simulation.
result = list(distances)
# plt.hist(dist, bins=8)
# plt.xlim([0,9])
# # plt.ylim([0,9])
    
In [35]:
    
import collections

# For every simulated run, count how often each distance 1..9 occurred.
# Looking the distances up explicitly keeps column d-1 tied to distance d:
# Counter iteration order is not sorted on Python 3, and a Counter has no
# iteritems() there. A distance that never occurred counts as 0, so every row
# has nine entries.
counts = []
for simulated_distances in result:
    tally = collections.Counter(simulated_distances)
    counts.append([tally[distance] for distance in range(1, 10)])
    
In [36]:
    
# Column-wise mean and standard deviation of the simulated counts.
counts_matrix = np.array(counts)
means = counts_matrix.mean(axis=0)
stds = counts_matrix.std(axis=0)
    
In [57]:
    
import itertools

# Observed counterpart of the random baseline: for every respondent, order the
# answer ids by the value given, slide the same three-wide windows over that
# order and record the absolute difference of each pair of ids in a window.
count_of_ones = []
answer_columns = [1,2,3,4,5,6,7,8,9,10]
window_starts = np.concatenate([np.arange(0,3), np.arange(5,8)])
# One bin per integer distance 1..9, centred on the integer, so the bins do
# not depend on which distances happen to be observed.
distance_bins = np.arange(0.5, 10.5)

fig, axes = plt.subplots(1,3,figsize=(12,4))
for k, (ax, question) in enumerate(zip(axes, ['question1','question2','question3'])):

    dist = []
    for ix, group in data_df[question].groupby('qustion_type'):
        for i, row in group[answer_columns].iterrows():
            # Sort once per respondent instead of once per window.
            ordered_ids = row.sort_values().index
            for window in window_starts:
                top3 = ordered_ids[window:window+3]
                for perm in itertools.combinations(top3, 2):
                    dist.append(np.abs(perm[0] - perm[1]))

    # Number of pairs whose ids are adjacent (distance 1); used by the z-score cells.
    count_of_ones.append(np.sum(np.array(dist) == 1))
    ax.hist(dist, bins=distance_bins, label='Observed Distance')
    ax.set_title('Topic {}'.format(k+1))
    ax.set_xlabel('distance')
    if k == 0:
        ax.set_ylabel('frequency')

    # Expected counts from the random baseline, centred on the same integers.
    ax.bar(np.arange(1,10), means, yerr=stds, color=colours[1], alpha=0.5, width=1, align='center', label='Expected Distance')

ax.legend(loc='center right', bbox_to_anchor=(2, 0.5), fontsize=16)
fig.tight_layout()
plt.show()
    
    
In [58]:
    
result = np.array(result)
    
In [59]:
    
# Per simulated run, the number of distances equal to 1.
ones = [np.sum(r == 1) for r in result]
    
In [60]:
    
plt.hist(ones)
plt.show()
    
    
In [61]:
    
mean = np.mean(ones)
std = np.std(ones)
    
In [62]:
    
mean
    
    Out[62]:
In [67]:
    
z_scores = (count_of_ones-mean)/std
    
In [68]:
    
count_of_ones
    
    Out[68]:
In [69]:
    
p_values = stats.norm.sf(np.abs(z_scores))
p_values
    
    Out[69]:
In [70]:
    
2.70081974e-03
    
    Out[70]:
In [ ]: