In [126]:
from __future__ import print_function

import json
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
%matplotlib inline

In [70]:
question_types = {
    1: 'ranking',
    2: 'select5',
    3: 'pointwise_score'
}

with open('../web-selection-form/data/questions.json') as json_file:
    question_mapping = json.load(json_file)
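
A quick sanity check of the assumed shape of questions.json, inferred from the lookups used below (hypothetical cell, not part of the original run):

In [ ]:
# Each question is assumed to carry a description and ten numbered answers.
for q in ['question1', 'question2', 'question3']:
    assert 'question_description' in question_mapping[q]
    assert set(question_mapping[q]['answers'].keys()) == set(str(i) for i in range(1, 11))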

In [71]:
# get the actual questions and answers
print(question_mapping['question1']['answers']['1'])
print(question_mapping['question1']['answers']['10'])


Current admission guidelines for US colleges admissions offices result in the colleges failing to admit many top quality students.
Beyond admissions, colleges should use Data Science as an early warning tool to detect students who need possible assistance.

In [72]:
# Two-level column index: per-question answer slots 1-10 plus per-question
# bookkeeping columns, followed by survey-level feedback and timing columns.
columns = ([('question1', i) for i in range(1,11)]
            +[('question1', 'question_type'),('question1', 'end_time')]
            +[('question2', i) for i in range(1,11)]
            +[('question2', 'question_type'),('question2', 'end_time')]
            +[('question3', i) for i in range(1,11)]
            +[('question3', 'question_type'),('question3', 'end_time')]
            +[('feedback', '')]
            +[('start_time', '')]
            +[('end_time', '')])

columns = pd.MultiIndex.from_tuples(columns)

data_df = pd.DataFrame(columns=columns)
data_df.index.name = 'respondent'
data_df


Out[72]:
question1 ... question3 feedback start_time end_time
1 2 3 4 5 6 7 8 9 10 ... 6 7 8 9 10 question_type end_time
respondent

0 rows × 39 columns
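
A more compact construction of the same 39-column index, as a sketch (assumes pd.MultiIndex.from_product and Index.append behave as in mainline pandas):

In [ ]:
# Sketch: build the per-question columns with from_product, then append
# the three survey-level columns.
q_cols = pd.MultiIndex.from_product(
    [['question1', 'question2', 'question3'],
     list(range(1, 11)) + ['question_type', 'end_time']])
extra = pd.MultiIndex.from_tuples([('feedback', ''), ('start_time', ''), ('end_time', '')])
# columns = q_cols.append(extra)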


In [82]:
data_path = './../responses/'

# Parse each response JSON file into one row of data_df.

for index, file in enumerate(os.listdir(data_path)):

    with open(os.path.join(data_path, file)) as json_file:
        data = json.load(json_file)

    data_df.set_value(index=index, col=('feedback', ''), value=data['qualitative_feedback'])

    start_time = datetime.datetime.fromtimestamp(data['time_start']/1000.0)
    end_time = datetime.datetime.fromtimestamp(data['time_end']/1000.0)
    
    data_df.set_value(index=index, col=('start_time', ''), value=start_time)
    data_df.set_value(index=index, col=('end_time', ''), value=end_time)

    # Completion timestamps for the three questions; sorting them together
    # with start_time lets us recover how long each question took.
    question_times = [datetime.datetime.fromtimestamp(data['question%i_time'%i]/1000.0) for i in range(1,4)]
    question_times = list(np.sort(question_times + [start_time]))
    
    for question_number in range(1,4):

        question_number_string = 'question%i'%question_number
        question_type = 'question%i_type'%question_number
        question_time = datetime.datetime.fromtimestamp(data['question%i_time'%question_number]/1000.0)
        question_selections = 'question%i_selections'%question_number

        this_q = question_times.index(question_time)
        # Time spent on this question: delta to the preceding timestamp.
        time_delta = question_times[this_q] - question_times[this_q-1]
        
        data_df.set_value(index=index, col=(question_number_string, 'question_type'), value=data[question_type])
        data_df.set_value(index=index, col=(question_number_string, 'end_time'), value=time_delta)

        if data[question_type] == 1: # 'ranking question'

            for i in range(1,11):

                answer = data[question_selections][i-1]
                data_df.set_value(index=index, col=(question_number_string, answer), value=i)

        elif data[question_type] == 2: # 'selecting question'

            for i in range(1,11):

                answer = 10 if i in data[question_selections] else 0
                data_df.set_value(index=index, col=(question_number_string, i), value=answer)

        else: # 'number question'

            answers = []
            for i in range(1,11):

                answers.append(int(data[question_selections][i]))
                
            for i in range(1,11):
                
                data_df.set_value(index=index, col=(question_number_string, i), value=(answers[i-1]/float(np.sum(answers))))

        
# Drop exact duplicates, then remove repeated submissions that differ only
# in their final end_time (respondents who submitted more than once).
data_df = data_df.drop_duplicates()
data_df = data_df.drop([7,17,28,29], axis=0)
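
A hedged side note: DataFrame.set_value is deprecated in newer pandas releases; on such versions the writes above can be expressed with .at, which accepts the MultiIndex column as a tuple:

In [ ]:
# Equivalent scalar writes on newer pandas (sketch, not run here):
# data_df.at[index, ('feedback', '')] = data['qualitative_feedback']
# data_df.at[index, ('start_time', '')] = start_time
# data_df.at[index, ('end_time', '')] = end_time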

In [83]:
data_df['question1'][data_df['question1']['question_type'] == 1]


Out[83]:
1 2 3 4 5 6 7 8 9 10 question_type end_time
respondent
3 4 6 10 7 9 5 8 3 1 2 1 0:03:20.055000
4 3 2 1 8 7 10 5 4 9 6 1 0:02:46.078000
5 6 4 8 7 5 9 10 3 1 2 1 0:01:35.948000
6 5 7 4 2 10 9 1 8 6 3 1 0:01:24.361000
12 6 3 9 5 4 10 1 2 8 7 1 0:00:06.229000
14 6 1 3 7 8 5 4 10 2 9 1 0:00:42.892000
22 7 4 9 1 3 5 6 8 10 2 1 0:03:09.368000
23 3 1 2 10 6 5 7 4 8 9 1 0:03:22.487000
27 1 9 5 8 4 3 6 2 7 10 1 0:00:58.677000
31 7 3 5 6 8 4 2 10 1 9 1 0:04:19.443000
33 2 4 5 6 7 8 3 9 1 10 1 0:02:57.543000
34 2 5 4 7 6 9 3 8 1 10 1 0:09:01.224000
39 2 4 1 9 8 7 3 6 5 10 1 0:43:13.912000
40 3 4 5 8 7 2 6 10 1 9 1 0:05:24.098000

In [7]:
data_df


Out[7]:
question1 ... question3 feedback start_time end_time
1 2 3 4 5 6 7 8 9 10 ... 6 7 8 9 10 question_type end_time
respondent
0 0.0555556 0.0925926 0.0925926 0.166667 0.037037 0.0740741 0.166667 0.12963 0.148148 0.037037 ... 10 0 0 10 10 2 2017-04-27 14:19:40.719000 Ranking was quite challenging, especially give... 2017-04-27 14:17:23.001000 2017-04-27 14:27:09.070000
1 0.15625 0.15625 0.15625 0.015625 0.015625 0.015625 0.15625 0.15625 0.15625 0.015625 ... 6 3 2 5 7 1 2017-04-27 14:51:36.537000 2017-04-27 14:40:32.115000 2017-04-27 14:59:16.515000
2 10 10 10 0 0 0 0 0 10 10 ... 0.160714 0.0714286 0.0357143 0.0178571 0.107143 3 2017-04-27 14:57:55.678000 ranking is difficult, scoring and/or selecting... 2017-04-27 14:44:51.384000 2017-04-27 15:00:08.110000
3 4 6 10 7 9 5 8 3 1 2 ... 0.176471 0.0196078 0.196078 0.0196078 0.156863 3 2017-04-27 14:55:39.893000 Ticking the 5 most relevant points to support ... 2017-04-27 14:46:53.279000 2017-04-27 15:00:30.966000
4 3 2 1 8 7 10 5 4 9 6 ... 0.0526316 0.0526316 0.526316 0.0526316 0.0526316 3 2017-04-27 15:19:58.372000 2017-04-27 15:07:22.730000 2017-04-27 15:20:44.715000
5 6 4 8 7 5 9 10 3 1 2 ... 0 10 10 0 10 2 2017-04-27 15:52:55.708000 2017-04-27 15:47:02.165000 2017-04-27 15:54:57.614000
6 5 7 4 2 10 9 1 8 6 3 ... 10 10 0 10 0 2 2017-04-27 17:53:33.511000 I liked the first metric (separate scores) mos... 2017-04-27 17:49:19.700000 2017-04-27 17:58:54.911000
7 5 7 4 2 10 9 1 8 6 3 ... 10 10 0 10 0 2 2017-04-27 17:53:33.511000 I liked the first metric (separate scores) mos... 2017-04-27 17:49:19.700000 2017-04-27 17:58:58.317000
8 0.241379 0.103448 0.0344828 0.172414 0.103448 0.0344828 0.0344828 0.172414 0.0344828 0.0689655 ... 0 0 0 0 10 2 2017-04-27 18:44:09.104000 2017-04-27 17:40:01.636000 2017-04-27 18:44:42.758000
9 0.132075 0.150943 0.0377358 0.0188679 0.113208 0.132075 0.188679 0.0754717 0.132075 0.0188679 ... 6 5 9 10 8 1 2017-04-27 19:35:38.131000 Selection was easiest to complete, but I think... 2017-04-27 19:09:53.142000 2017-04-27 19:36:42.114000
10 0.109091 0.0909091 0.0909091 0.0909091 0.109091 0.0909091 0.0909091 0.145455 0.0909091 0.0909091 ... 10 0 10 0 10 2 2017-04-27 20:23:15.051000 2017-04-27 20:20:16.054000 2017-04-27 20:23:20.520000
11 0.0882353 0.205882 0.147059 0.0294118 0.0294118 0.0588235 0.147059 0.0294118 0.205882 0.0588235 ... 1 6 9 2 4 1 2017-04-27 20:24:15.779000 2017-04-27 20:13:55.582000 2017-04-27 20:24:19.487000
12 6 3 9 5 4 10 1 2 8 7 ... 10 0 0 10 10 2 2017-04-27 20:46:13.278000 2017-04-27 20:45:19.272000 2017-04-27 20:48:55.049000
13 0.078125 0.109375 0.09375 0.09375 0.125 0.0625 0.078125 0.125 0.09375 0.140625 ... 2 3 10 6 8 1 2017-04-27 21:03:17.482000 Giving a score of 1 to 10 was the most difficu... 2017-04-27 20:51:08.794000 2017-04-27 21:06:33.623000
14 6 1 3 7 8 5 4 10 2 9 ... 0.157895 0.0175439 0.0175439 0.157895 0.0877193 3 2017-04-27 23:48:18.278000 2017-04-27 23:37:29.434000 2017-04-27 23:49:04.291000
15 10 10 0 0 10 10 0 0 10 0 ... 0.196078 0.0392157 0.0980392 0.176471 0.0980392 3 2017-04-27 23:51:30.535000 Ranking is the most difficult. I prefer the f... 2017-04-27 23:30:32.928000 2017-04-27 23:53:12.786000
16 10 0 0 0 10 0 0 10 10 10 ... 0.0958904 0.136986 0.0684932 0.0821918 0.0821918 3 2017-04-28 01:25:09.923000 Assigning values from 1-10 was the most diffic... 2017-04-28 01:21:43.367000 2017-04-28 01:28:29.475000
17 0 0 0 0 10 10 10 10 10 0 ... 0.163636 0.0909091 0.0181818 0.109091 0.0545455 3 2017-04-28 08:27:21.046000 2017-04-28 08:23:26.424000 2017-04-28 08:38:31.676000
18 0 0 0 0 10 10 10 10 10 0 ... 0.163636 0.0909091 0.0181818 0.109091 0.0545455 3 2017-04-28 08:27:21.046000 2017-04-28 08:23:26.424000 2017-04-28 08:39:33.224000
19 0.117647 0.0235294 0.0823529 0.117647 0.0941176 0.117647 0.105882 0.117647 0.105882 0.117647 ... 10 9 5 1 8 1 2017-04-28 08:44:12.605000 2017-04-28 08:34:39.694000 2017-04-28 08:44:19.413000
20 0.115385 0.0769231 0.0897436 0.102564 0.102564 0.0641026 0.0641026 0.128205 0.128205 0.128205 ... 7 6 4 5 1 1 2017-04-28 09:00:06.135000 I think ranking was the easiest format in this... 2017-04-28 08:54:13.865000 2017-04-28 09:00:56.242000
21 0.125 0.111111 0.0972222 0.111111 0.0694444 0.0972222 0.111111 0.0694444 0.0833333 0.125 ... 3 7 9 8 6 1 2017-04-28 09:01:55.816000 The checkbox question was the easiest to compl... 2017-04-28 08:55:47.607000 2017-04-28 09:04:06.216000
22 7 4 9 1 3 5 6 8 10 2 ... 10 10 10 0 10 2 2017-04-28 09:28:24.249000 2/3rds of everything is irrelevant. What matte... 2017-04-28 09:26:14.238000 2017-04-28 09:36:20.315000
23 3 1 2 10 6 5 7 4 8 9 ... 10 10 0 0 0 2 2017-04-28 09:52:32.681000 Question 1's ranking scheme was the easiest as... 2017-04-28 09:44:08.985000 2017-04-28 09:54:05.696000
24 10 10 10 0 0 0 10 0 10 0 ... 1 5 9 7 4 1 2017-04-28 10:30:06.278000 Ranking was more difficult than selection.\nTh... 2017-04-28 10:16:32.505000 2017-04-28 10:34:30.335000
25 0.0357143 0.160714 0.125 0.125 0.0892857 0.0357143 0.125 0.142857 0.107143 0.0535714 ... 10 0 0 0 10 2 2017-04-28 10:41:11.478000 Ranking was more difficult. Particularly when ... 2017-04-28 10:31:23.779000 2017-04-28 10:47:10.506000
26 10 10 10 0 0 0 10 0 10 0 ... 0.188679 0.113208 0.0188679 0.0566038 0.0754717 3 2017-04-28 10:54:23.681000 Was confusing that the first two questions wen... 2017-04-28 10:48:33.597000 2017-04-28 10:54:47.054000
27 1 9 5 8 4 3 6 2 7 10 ... 0.0714286 0.0952381 0.214286 0.0238095 0.119048 3 2017-04-28 11:59:03.843000 2017-04-28 11:54:20.225000 2017-04-28 12:00:05.339000
28 1 9 5 8 4 3 6 2 7 10 ... 0.0714286 0.0952381 0.214286 0.0238095 0.119048 3 2017-04-28 11:59:03.843000 2017-04-28 11:54:20.225000 2017-04-28 12:00:14.519000
29 1 9 5 8 4 3 6 2 7 10 ... 0.0714286 0.0952381 0.214286 0.0238095 0.119048 3 2017-04-28 11:59:03.843000 2017-04-28 11:54:20.225000 2017-04-28 12:00:15.731000
30 0.0862069 0.0862069 0.137931 0.0172414 0.137931 0.0689655 0.0862069 0.137931 0.137931 0.103448 ... 10 0 0 10 10 2 2017-04-28 12:14:02.721000 I thought ranking the options (Question 3) was... 2017-04-28 12:01:02.944000 2017-04-28 12:28:23.361000
31 7 3 5 6 8 4 2 10 1 9 ... 0.14 0.08 0.04 0.12 0.1 3 2017-04-28 16:35:47.740000 the drag-and-drop method facilitates focus on ... 2017-04-28 16:25:39.019000 2017-04-28 16:40:21.713000
32 0.106383 0.212766 0.212766 0.0851064 0.0425532 0.0638298 0.0851064 0.0425532 0.106383 0.0425532 ... 10 0 0 10 10 2 2017-04-28 16:39:06.212000 Several theorems in game theory and social sci... 2017-04-28 16:28:03.486000 2017-04-28 17:00:47.559000
33 2 4 5 6 7 8 3 9 1 10 ... 0.147059 0.132353 0.0735294 0.147059 0.0441176 3 2017-04-28 17:52:48.367000 2017-04-28 17:46:38.305000 2017-04-28 17:52:54.337000
34 2 5 4 7 6 9 3 8 1 10 ... 0.0545455 0.145455 0.127273 0.0181818 0.163636 3 2017-04-28 20:57:52.151000 I liked the voting format that automatically r... 2017-04-28 20:35:57.810000 2017-04-28 20:57:54.747000
35 0.0555556 0.166667 0.222222 0.0555556 0.0555556 0.111111 0.0555556 0.0555556 0.0555556 0.166667 ... 1 5 8 4 6 1 2017-04-28 23:09:34.021000 Drag drop provides the most feedback -- you ca... 2017-04-28 22:57:09.282000 2017-04-28 23:10:47.643000
36 0.120482 0.0843373 0.060241 0.060241 0.120482 0.0963855 0.120482 0.120482 0.120482 0.0963855 ... 2 4 7 1 5 1 2017-04-29 01:37:20.243000 I found that question 1 required a fair amount... 2017-04-29 01:24:46.422000 2017-04-29 01:43:55.095000
37 0.142857 0.128571 0.0714286 0.0571429 0.0857143 0.114286 0.114286 0.114286 0.128571 0.0428571 ... 10 10 0 10 0 2 2017-04-29 02:08:16.790000 I thought tanking was a bit more difficult. Tr... 2017-04-29 01:50:05.323000 2017-04-29 02:12:47.818000
38 0 0 0 0 10 10 10 0 10 10 ... 8 9 1 7 4 1 2017-04-29 03:49:08.157000 2017-04-29 03:41:16.234000 2017-04-29 03:51:01.789000

39 rows × 39 columns


In [8]:
fig, axes = plt.subplots(1,3, figsize=(15,3))
print('Question 1')
print(question_mapping['question1']['question_description'])
for i, ax in enumerate(axes):
    
    agg_num = data_df['question1'][data_df['question1']['question_type'] == i + 1][[1,2,3,4,5,6,7,8,9,10]].sum()
    
    q_num = agg_num.index
    data = agg_num.values
    ax.bar(q_num, data)
    ax.set_title(question_types[i+1])


Question 1
<i>Many people feel that USA college admissions are highly skewed toward outdated metrics. Recent research has uncovered metrics that are better able to predict students' future performance in college. However, colleges are reluctant to adopt these new metrics.</i><br/><br/>Imagine that you are in a debate and you are asked to support the argument that USA college admissions should change their admission metrics.

In [9]:
fig, axes = plt.subplots(1,3, figsize=(15,3))
print('Question 2')
print(question_mapping['question2']['question_description'])
for i, ax in enumerate(axes):
    
    agg_num = data_df['question2'][data_df['question2']['question_type'] == i + 1][[1,2,3,4,5,6,7,8,9,10]].sum()
    
    q_num = agg_num.index
    data = agg_num.values
    ax.bar(q_num, data)
    ax.set_title(question_types[i+1])


Question 2
<i>The 'get to know you' styled interview is becoming a popular trend. However, research shows that humans are surprisingly bad at inferring essential qualities about an applicant's character and thus may make poor judgments about the future success of an applicant.</i><br/><br/> Imagine that you are in a debate and that you are asked to argue in support of the motion above, that the current structure of interviews is not helpful in successful hiring practices.

In [10]:
fig, axes = plt.subplots(1,3, figsize=(15,3))
print('Question 3')
print(question_mapping['question3']['question_description'])
for i, ax in enumerate(axes):
    
    agg_num = data_df['question3'][data_df['question3']['question_type'] == i + 1][[1,2,3,4,5,6,7,8,9,10]].sum()
    
    q_num = agg_num.index
    data = agg_num.values
    ax.bar(q_num, data)
    ax.set_title(question_types[i+1])


Question 3
<i>The World Health Organization (WHO) has proposed to have video game addiction included in its catalog of mental diseases, but rhetoric stating that video games are addictive and comparing them to drugs is misguided.</i><br/><br/> Imagine that you are in a debate and that you are asked to support the statement that video games should not be classified as addictive.
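
The three plotting cells above repeat the same aggregation; a consolidated sketch (same behavior, given data_df, question_mapping and question_types as defined above):

In [ ]:
# Sketch: one helper covering the three per-question plotting cells.
def plot_question(q):
    fig, axes = plt.subplots(1, 3, figsize=(15, 3))
    key = 'question{}'.format(q)
    print('Question {}'.format(q))
    print(question_mapping[key]['question_description'])
    for i, ax in enumerate(axes):
        agg = data_df[key][data_df[key]['question_type'] == i + 1][[1,2,3,4,5,6,7,8,9,10]].sum()
        ax.bar(agg.index, agg.values)
        ax.set_title(question_types[i + 1])

# plot_question(1); plot_question(2); plot_question(3)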

In [11]:
print(question_mapping['question1']['answers']['6'])
print(question_mapping['question1']['answers']['9'])
print(question_mapping['question1']['answers']['8'])
print()
print(question_mapping['question2']['answers']['3'])
print(question_mapping['question2']['answers']['5'])
print(question_mapping['question2']['answers']['1'])
print()
print(question_mapping['question3']['answers']['6'])
print(question_mapping['question3']['answers']['6'])
print(question_mapping['question3']['answers']['10'])


Improved admissions, resulting in improved graduation rates, will show in world rankings four years after initial implementation.
Currently colleges are not taking advantage of advanced statistical methods that can be used to predict student success.
Admissions officers should be more engaged in the long term goals of the university rather than optimizing for world rankings.

Unstructured, 'get-to-know' interviews are becoming popular in the workspace and in college admissions, yet these form a poor metric for predicting the future job performance of the interviewee.
Students are better at predicting other students' GPA scores when no interview is conducted.
Circumstance may be highly misleading in an interview scenario as an interviewee's demeanor may depend highly on external, hidden circumstances.

The American Journal of Psychiatry has published a study showing that the mental and social heath of the purported video game addicts is no different from individuals who are not addicted to video games.
The American Journal of Psychiatry has published a study showing that the mental and social heath of the purported video game addicts is no different from individuals who are not addicted to video games.
Using video gaming to relax does not constitute an addiction in much the same way as watching sports is not addictive.

View the qualitative feedback:


In [16]:
for i, f in data_df['feedback'].iteritems():
    
    print(f)
    print()


Ranking was quite challenging, especially given that several statements were highly similar or conceptually related.
I think it would be helpful to have some very stupid arguments thrown in so that you can have greater variability in your measurement.
Glad to see you taking an interest in social psychology, little sister!



ranking is difficult, scoring and/or selecting is easier

Ticking the 5 most relevant points to support my argument is the easiest form of feedback. Placing the different points in order is the most difficult and requires the most time as you have to read through each point multiple times in order to make comparisons and form the list in the order you want. Ranking the points on a scale of 1-10 is also relatively easy, but may not give the most accurate results as I often just end up choosing a random number in the region of important (6-10) or unimportant (which in this case I just left as 1 as I had 5 points already).





I liked the first metric (separate scores) most, because I didn't have to do any artificial ranking or make difficult decisions between N things at once. I could. This is interesting though. One factor that will complicate your analysis is that more of the arguments for the first position were compelling than for the second two (though perhaps I'm biased by the format). So direct quantitative comparison between the formats, which doesn't account for the inherent differences in the distribution of argument persuasivenesses, might be misleading.



I liked the first metric (separate scores) most, because I didn't have to do any artificial ranking or make difficult decisions between N things at once. I could. This is interesting though. One factor that will complicate your analysis is that more of the arguments for the first position were compelling than for the second two (though perhaps I'm biased by the format). So direct quantitative comparison between the formats, which doesn't account for the inherent differences in the distribution of argument persuasivenesses, might be misleading.





Selection was easiest to complete, but I think the rank-ordering will be the most highly informative.







Giving a score of 1 to 10 was the most difficult as it was hard to be fair and determine what I thought deserved a certain number. Choosing the 5 best arguments was fairly easy as I didn't really need to rank the statements. Dragging to rank all the choices was somewhat difficult but visually it was simple and easy because I was able to see my choices in order, rather than just attributing them a number 1 through 10.



Ranking is the most difficult.  I prefer the format that lets me choose on a scale from 1-10 how strong I think the argument is.  

Assigning values from 1-10 was the most difficult voting format.  Simple selection was the easiest, and ranking fell in the middle.  I believe simple selection will produce the most coherent points, but it may be a very small difference between ranking and selection.  Assigning values, while probably being better to analyze statistically, will probably produce the worst bias.







I think ranking was the easiest format in this survey

The checkbox question was the easiest to complete. The ranking question took the longest - I put greater emphasis on the first 5 ranks and cared less about the remaining 5. In the coring question from 1-10 I felt  that my numbers were fairly arbitrary and I did not need to use the full 1-10 scale (only used scores 1,5,6,7,8,9). 

2/3rds of everything is irrelevant. What matters I suppose is being able to justifiably argue your point in a way which will support the evidence that preceded. Furthermore question the obvious and simply do what you can with what you have in the time you have left (desperado)

Question 1's ranking scheme was the easiest as it's easier to visually arrange points. However, Question 2's ranking scheme might produce a better sub-selection of 5 because the focus was on finding only the most relevant points (minimizing cognitive load a bit). 

Ranking was more difficult than selection.
The selection will result in the most coherent sub-selection of 5 points. The constraint of only choosing the 5 most relevant points necessitates the person to do some sort of ranking.
I like the simplicity of selecting the 5 points (the third format presented), but my preferred format of scoring each point from 1 to 10. This allowed me to assign 1 to the points I would never use and also put together a list of the points I would use in my debate. Furthermore, the usable points are now ranked in order, which may be handy when selecting points in a debate.

Ranking was more difficult. Particularly when needing to evaluate several equally poor statements, needing to determine an ordering among those was not as easy as simply assigning a score to each statement. I believe that the scoring mechanism will allow you to see definitive separation between stronger and weaker statements as the gap between their position/score can be demonstrably widened.

Was confusing that the first two questions went in opposite directions - easy to miss







I thought ranking the options (Question 3) was the best approach for comparing arguments. It's easier and faster to compare a given argument with two or one adjacent arguments than it is to judge them in absolute terms. 

I think the formats in Questions 2 and 3 are probably equivalent for selecting the best subset of five arguments. I disliked the format in Question 1 because it requires more work to compare different arguments. 

the drag-and-drop method facilitates focus on comparing two points when I repeatedly ask myself "is this option better than the one above it?" whereas the "select top 5" makes me compare the option in question to up to 5 others (if I've already selected 5) to decide if it deserves to be selected above one of the others; and the "rate each option" voting mechanism forces me to weigh up the relative strength of each option against all the other options in order to balance my scoring. 

Several theorems in game theory and social science [Arrow's, Gibbard–Satterthwaite] state that there's no excellent method of taking preferences from the members of a group and building a set of preferences (or single top choice) that the group would agree with. There might be a loophole around "separate into a top half and a bottom half", but it seems unlikely. Whatever mechanism you decide on will be vulnerable to some particular set of participant votes. That said, it may be possible to find a mechanism that works well for common voting patterns.

It's also not clear why collecting LESS data (ordinal or top-5) would ever be more useful than collecting the full scores, unless perhaps the task of scoring 1-10 is more noisy for reasons of cognitive load and greater breaking of IIA. But certainly whatever math is run on the top-5 data could instead be run on the top 5 of the ranking data. Unless the aim of this research is to show that less taxing question formats are less noisy, I don't see the point.

Overall, I suspect that I'm about to click through and get told I've been lied to and the real purpose of this research is something else altogether. 



I liked the voting format that automatically resorted the choices as they were selected. This made it visually easy to follow the order of 10 items and reorder as needed. I also liked the "choose the best 5" voting format, because I didn't have to select options that I felt were irrelevant, and also because I didn't have to put them in an order and found many choices to be equally as good as another. It was also quick and easy. I disliked the "insert order" voting option because it was cumbersome to go back and re enter numbers and ensure I didn't rank multiple options with the same number. I liked it better when it resorted for me..

Drag drop provides the most feedback -- you can immediately see the other items reordered. 

I found that question 1 required a fair amount of previous understanding of the US college application process and of acronyms used (such as ACT). I wasn't clear on what a few of the statements meant, not being familiar with the US system myself. I found selecting 5 relevant points the easiest selection, probably because I didn't need to have organized my thoughts as much as the first two that required specific ranking or scoring. I therefore believe that the relevant selection will probably result in the most whereby subsection of 5 points :)

I thought tanking was a bit more difficult. Trying to assess the strength of a single sentence set of claims based on a value of 1-10 is arbitrary.
I thought it was a fascinating way to ask questions and fruitful to uncover interesting dynamics in the way that questions are answered. 



function [regret] = set_regret(profile,S,T) % Regret of set S for set T

if isempty(T) regret = 0; return end

[I,J] = find(ismember(profile,S)); [~,ind] = unique(I,'first'); first_indices_S = J(ind);

[I,J] = find(ismember(profile,T)); [~,ind] = unique(I,'first'); first_indices_T = J(ind);

regret = sum((first_indices_T < first_indices_S).*(1./first_indices_T)); end
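
Below is a Python port of the MATLAB set_regret reference above.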


In [ ]:
def set_regret(profile, S, T):
    '''
    Regret of set S with respect to set T (naive method, ported from the
    MATLAB above): for each ranking in the profile, if some element of T
    appears before every element of S, add 1/(position of the first
    element of T).
    '''
    if len(T) == 0:
        return 0

    # 1-based position of the first element of S (resp. T) in each ranking.
    first_indices_S = np.array([min(i for i, elem in enumerate(ranking) if elem in S) + 1 for ranking in profile])
    first_indices_T = np.array([min(i for i, elem in enumerate(ranking) if elem in T) + 1 for ranking in profile])

    return np.sum((first_indices_T < first_indices_S) * (1.0 / first_indices_T))
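
A toy check of the port (illustrative values only, not study data):

In [ ]:
# Toy usage of set_regret; values are illustrative only.
toy_profile = [[1, 2, 3, 4], [2, 1, 4, 3], [3, 4, 1, 2]]
print(set_regret(toy_profile, {1, 2}, {3, 4}))
# Only the third ranking places an element of T = {3, 4} (item 3, at
# position 1) before every element of S = {1, 2}, so regret = 1/1 = 1.0.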

In [46]:
T = set([2,3,7,11])
S = set([1,5])

In [47]:
len(T) == 0


Out[47]:
False

In [85]:
# A random preference profile: 10 voters, each a uniformly random
# permutation of the items 1-10.
profile = [list(np.random.permutation([1,2,3,4,5,6,7,8,9,10])) for i in range(10)]

In [100]:
import itertools
k = 3
permutations1 = itertools.permutations([1,2,3,4,5,6,7,8,9,10], k)

In [101]:
# Brute-force minimax regret: choose the size-k set S minimizing the
# maximum regret over all alternative sets T. Regret depends only on set
# membership, so itertools.combinations would give the same answer with
# roughly 36x less work; permutations is kept to match the cell above.
output_regret = np.inf
best_set = None
for S in permutations1:
    
    max_reg = 0
    
    first_indices_S = np.array([[i for i,elem in enumerate(x) if x[i] in S][0] + 1 for x in profile])
    
    permutations2 = itertools.permutations([1,2,3,4,5,6,7,8,9,10], k)
    for T in permutations2:
        
        first_indices_T = np.array([[i for i,elem in enumerate(x) if x[i] in T][0] + 1 for x in profile])

        regret = np.sum((first_indices_T < first_indices_S)*(1./first_indices_T))
        
        if regret > max_reg:
            
            max_reg = regret
            
    if max_reg < output_regret:

        output_regret = max_reg
        best_set = S
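
Since regret depends only on set membership, the same search runs roughly 36x faster over combinations, reusing the set_regret port above (a sketch):

In [ ]:
# Sketch: minimax-regret search over unordered sets via combinations.
min_regret = np.inf
best_combo = None
for S in itertools.combinations(range(1, 11), k):
    max_reg = max(set_regret(profile, S, T)
                  for T in itertools.combinations(range(1, 11), k))
    if max_reg < min_regret:
        min_regret = max_reg
        best_combo = S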

In [102]:
output_regret


Out[102]:
3.8333333333333335

In [103]:
best_set


Out[103]:
(5, 7, 8)

In [15]:
import pickle

In [84]:
q1_sort_profile = data_df['question1'][data_df['question1']['question_type'] == 1][[1,2,3,4,5,6,7,8,9,10]].values
with open('./q1_sort_profile.p','wb') as file_:
    pickle.dump(q1_sort_profile, file_)

In [85]:
q2_sort_profile = data_df['question2'][data_df['question2']['question_type'] == 1][[1,2,3,4,5,6,7,8,9,10]].values
with open('./q2_sort_profile.p','wb') as file_:
    pickle.dump(q2_sort_profile, file_)

In [86]:
q3_sort_profile = data_df['question3'][data_df['question3']['question_type'] == 1][[1,2,3,4,5,6,7,8,9,10]].values
with open('./q3_sort_profile.p','wb') as file_:
    pickle.dump(q3_sort_profile, file_)
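
The three pickle cells above differ only in the question number; an equivalent loop, as a sketch:

In [ ]:
# Sketch: write the same three sort-profile pickles in one loop.
for q in range(1, 4):
    key = 'question{}'.format(q)
    profile_q = data_df[key][data_df[key]['question_type'] == 1][[1,2,3,4,5,6,7,8,9,10]].values
    with open('./q{}_sort_profile.p'.format(q), 'wb') as file_:
        pickle.dump(profile_q, file_)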

In [87]:
def rate_to_rank(question_num):
    # Convert pointwise scores (type 3) into rankings: break ties by adding
    # tiny random noise, then order items from highest to lowest score.
    key = 'question{}'.format(question_num)
    rate_profile = data_df[key][data_df[key]['question_type'] == 3][[1,2,3,4,5,6,7,8,9,10]].values
    rate_list = []
    for i in range(len(rate_profile)):
        unique, counts = np.unique(rate_profile[i], return_counts=True)
        for k in range(len(counts)):
            if counts[k] > 1:
                cntr = 1
                # Perturb randomly chosen duplicates until the score is unique.
                while np.sum(rate_profile[i] == unique[k]) > 1:
                    indices = np.where(rate_profile[i] == unique[k])
                    idx = np.random.choice(indices[0])
                    rate_profile[i][idx] += cntr*.0000001
                    cntr += 1
        # Pair each (now unique) score with its 0-based item index and sort
        # descending; the result lists item numbers from best to worst.
        ranking = zip(list(rate_profile[i]), range(10))
        rate_list.append([x[1]+1 for x in reversed(sorted(ranking))])

    rate_list = np.array(rate_list)
    with open('./q{}_rate_profile.p'.format(question_num),'wb') as file_:
        pickle.dump(rate_list, file_)
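
The tie-breaking loop above can be compressed with argsort; a minimal sketch (scores_to_ranking is a hypothetical helper, not part of the original analysis):

In [ ]:
# Sketch: same scores-to-ranking conversion via argsort.
def scores_to_ranking(scores):
    # Item numbers (1-10) from highest to lowest score; ties broken by
    # tiny uniform noise, mirroring the random perturbation above.
    scores = np.asarray(scores, dtype=float)
    scores = scores + np.random.uniform(0, 1e-7, size=scores.shape)
    return list(np.argsort(-scores) + 1)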

In [88]:
rate_to_rank(1)
rate_to_rank(2)
rate_to_rank(3)

In [95]:
for k in range(1,4):
    for typ in ['rate','sort']:
        with open('./q{}_{}_results.p'.format(k,typ), 'rb') as file_:
            results = pickle.load(file_)
        print("Question {}, Type: {}".format(k,typ))
        print(results)
        for i in range(len(results)):
            print(question_mapping['question{}'.format(k)]['answers']['{}'.format(results[i])])
        print('\n')


Question 1, Type: rate
[1, 2, 3, 7, 8]
Current admission guidelines for US colleges admissions offices result in the colleges failing to admit many top quality students.
Cumulative high school grade is widely used in the admissions process but grade point average (GPA) in senior years is more predictive of college success than high GPA in junior years.
Average ACT scores are widely used in college admissions but scores for only the Math and English sections are more predictive of student success in college.
There is currently an inherent disconnect between admissions offices and graduation rates (e.g. there is no direct feedback from graduation to the admissions decisions that were made four years prior).
Admissions officers should be more engaged in the long term goals of the university rather than optimizing for world rankings.


Question 1, Type: sort
[1, 2, 3, 6, 7]
Current admission guidelines for US colleges admissions offices result in the colleges failing to admit many top quality students.
Cumulative high school grade is widely used in the admissions process but grade point average (GPA) in senior years is more predictive of college success than high GPA in junior years.
Average ACT scores are widely used in college admissions but scores for only the Math and English sections are more predictive of student success in college.
Improved admissions, resulting in improved graduation rates, will show in world rankings four years after initial implementation.
There is currently an inherent disconnect between admissions offices and graduation rates (e.g. there is no direct feedback from graduation to the admissions decisions that were made four years prior).


Question 2, Type: rate
[1, 4, 7, 8, 10]
Circumstance may be highly misleading in an interview scenario as an interviewee's demeanor may depend highly on external, hidden circumstances.
Interviews can actually be harmful to the hiring process, undercutting the impact of other, more valuable information about interviewees.
When interviewees respond randomly to interview questions, the interviewer has a strong belief that she 'got to know' the interviewee (even though the responses have no bearing on the actual beliefs of the interviewee).
Interviewers naturally turn irrelevant information into a coherent narrative, biasing their conclusions.
Interviews should be used to test job related skills.


Question 2, Type: sort
[2, 4, 6, 7, 10]
An interviewee's conduct may be interpreted differently by different interviewers.
Interviews can actually be harmful to the hiring process, undercutting the impact of other, more valuable information about interviewees.
When interviewees respond randomly to interview questions, the interviewer is unable to detect this trend.
When interviewees respond randomly to interview questions, the interviewer has a strong belief that she 'got to know' the interviewee (even though the responses have no bearing on the actual beliefs of the interviewee).
Interviews should be used to test job related skills.


Question 3, Type: rate
[3, 4, 5, 6, 8]
The fact that a pleasurable activity released dopamine is uninformative, as dopamine is released while playing video games, taking drugs and while partaking in any form of pleasurable activity.
The American Journal of Psychiatry has published a study showing that at most 1 percent of video game players might exhibit characteristics of an addiction.
The American Journal of Psychiatry has published a study showing that gambling is more addictive than video games.
The American Journal of Psychiatry has published a study showing that the mental and social heath of the purported video game addicts is no different from individuals who are not addicted to video games.
We and our children are 'addicted' to new technologies because they improve our lives or are plainly enjoyable to use.


Question 3, Type: sort
[1, 2, 3, 7, 10]
Video gaming is not damaging or disruptive to one's life and thus should not be compared to a drug.
Dopamine levels that are released while playing video games are vastly lower than those released while taking a drug such as methamphetamine.
The fact that a pleasurable activity released dopamine is uninformative, as dopamine is released while playing video games, taking drugs and while partaking in any form of pleasurable activity.
Treating the immoderate playing of video games as an addiction is pathologizing relatively normal behavior.
Using video gaming to relax does not constitute an addiction in much the same way as watching sports is not addictive.



In [120]:
def calc_spearman(question_num):
    # Same score-to-ranking conversion as rate_to_rank, followed by pairwise
    # Spearman correlations between all pairs of respondents.
    key = 'question{}'.format(question_num)
    rate_profile = data_df[key][data_df[key]['question_type'] == 3][[1,2,3,4,5,6,7,8,9,10]].values
    rate_list = []
    for i in range(len(rate_profile)):
        unique, counts = np.unique(rate_profile[i], return_counts=True)
        for k in range(len(counts)):
            if counts[k] > 1:
                cntr = 1
                while np.sum(rate_profile[i] == unique[k]) > 1:
                    indices = np.where(rate_profile[i] == unique[k])
                    idx = np.random.choice(indices[0])
                    rate_profile[i][idx] += cntr*.0000001
                    cntr += 1
        ranking = zip(list(rate_profile[i]), range(10))
        rate_list.append([x[1]+1 for x in reversed(sorted(ranking))])

    rate_list = np.array(rate_list)
    srs = []
    for i in range(len(rate_list)):
        for j in range(i+1, len(rate_list)):
            srs.append(spearmanr(rate_list[i], rate_list[j]))
    return srs
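
With the hypothetical scores_to_ranking helper above, calc_spearman reduces to a few lines (same statistics up to tie-breaking randomness):

In [ ]:
# Sketch: compact refactor of calc_spearman using scores_to_ranking.
def calc_spearman_alt(question_num):
    key = 'question{}'.format(question_num)
    rate_profile = data_df[key][data_df[key]['question_type'] == 3][[1,2,3,4,5,6,7,8,9,10]].values
    rate_list = [scores_to_ranking(row) for row in rate_profile]
    return [spearmanr(rate_list[i], rate_list[j])
            for i in range(len(rate_list))
            for j in range(i + 1, len(rate_list))]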

Mean pairwise Spearman correlation coefficients for the rating (pointwise score) responses:


In [121]:
sr_list_1 = calc_spearman(1)
print(np.mean([sr_list_1[i][0] for i in range(len(sr_list_1))]))
sr_list_2 = calc_spearman(2)
print(np.mean([sr_list_2[i][0] for i in range(len(sr_list_2))]))
sr_list_3 = calc_spearman(3)
print(np.mean([sr_list_3[i][0] for i in range(len(sr_list_3))]))


0.0020202020202
-0.037037037037
-0.00826446280992

Mean pairwise agreement statistics for the ranking responses (Spearman, Wilcoxon, Kendall's tau, Mann-Whitney):


In [127]:
def avg_scc_q(question_num):
    # Mean pairwise agreement between respondents' rankings, via four
    # statistics: Spearman rho, Wilcoxon signed-rank, Kendall tau, and
    # Mann-Whitney U. Note that every ranking is a permutation of 1-10,
    # so the two samples always hold identical values and the Mann-Whitney
    # U statistic is constant (10*10/2 = 50); it is uninformative here.
    key = 'question{}'.format(question_num)
    sort_profile = data_df[key][data_df[key]['question_type'] == 1][[1,2,3,4,5,6,7,8,9,10]].values
    srs = []
    wc = []
    kt = []
    mw = []
    for i in range(len(sort_profile)):
        for j in range(i+1, len(sort_profile)):
            srs.append(spearmanr(sort_profile[i], sort_profile[j]))
            wc.append(wilcoxon(sort_profile[i], sort_profile[j]))
            kt.append(kendalltau(sort_profile[i], sort_profile[j]))
            mw.append(mannwhitneyu(sort_profile[i], sort_profile[j]))
    print(np.mean([srs[i][0] for i in range(len(srs))]))  # mean Spearman rho
    print(np.mean([wc[i][0] for i in range(len(wc))]))    # mean Wilcoxon statistic
    print(np.mean([kt[i][0] for i in range(len(kt))]))    # mean Kendall tau
    print(np.mean([mw[i][0] for i in range(len(mw))]))    # mean Mann-Whitney U

In [128]:
avg_scc_q(1)
avg_scc_q(2)
avg_scc_q(3)


/Users/soph/anaconda/envs/py27/lib/python2.7/site-packages/scipy/stats/morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
0.0825174825175
21.0714285714
0.0568986568987
50.0
0.0517906336088
21.2121212121
0.037037037037
50.0
-0.0331680440771
22.0363636364
-0.0311111111111
50.0

In [ ]: