In [51]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import copy
import pandas as pd
import numpy as np
import datetime
In [52]:
# Load the raw survey responses from a tab-separated file.
d = pd.read_csv('../../data/raw_responses.tsv', sep = '\t')
In [53]:
# Keep only responses submitted after the cutoff.
# The 'Timestamp' column holds 'month/day/year hour:minute:second' strings.
# Comparing those as text is lexicographic (e.g. '10/1/2016 ...' sorts before
# '2/29/2016 ...'), so parse them into real datetimes before comparing.
# Values that cannot be parsed become NaT, compare as False and are dropped.
submitted_at = pd.to_datetime(d['Timestamp'], format='%m/%d/%Y %H:%M:%S', errors='coerce')
d = d[submitted_at > pd.Timestamp('2016-02-29 16:17:00')]
In [54]:
# Preview the first rows of the filtered responses.
d.head()
Out[54]:
In [55]:
# Replace every missing value (in all columns) with the sentinel string
# 'no_response'; the answer-recoding dictionaries map this sentinel to the
# label 'no response'.
d = d.fillna('no_response')
In [56]:
# Raw survey column headers -> short internal column names.
# The keys are looked up against the column names of the loaded file, so their
# wording (including "This is you survey ID") must stay exactly as written.
d_q = {
    'Timestamp': 'submit_timestamp',
    'I am reading this article to': 'raw_information_depth',
    'Prior to visiting this article': 'raw_prior_knowledge',
    'I am reading this article because': 'raw_motivation',
    'This is you survey ID. Please do not modify.': 'token',
}

# Answer text -> label for the "information depth" question.
d_a1 = {
    'look up a specific fact or to get a quick answer.': 'fact',
    'get an overview of the topic.': 'overview',
    'get an in-depth understanding of the topic.': 'in-depth',
    'no_response': 'no response',
}

# Answer text -> label for the "prior knowledge" question.
d_a2 = {
    'I was already familiar with the topic.': 'familiar',
    'I was not familiar with the topic and I am learning about it for the first time.': 'unfamiliar',
    'no_response': 'no response',
}

# Answer text -> label for the "motivation" question.
# Insertion order is preserved because the motivation recoder applies these
# substitutions in dictionary order.
d_a3 = {
    'I have a work or school-related assignment.': 'work/school',
    'I need to make a personal decision based on this topic (e.g. to buy a book, choose a travel destination).': 'personal decision',
    "I want to know more about a current event (e.g. a soccer game, a recent earthquake, somebody's death).": 'current event',
    'the topic was referenced in a piece of media (e.g. TV, radio, article, film, book).': 'media',
    'the topic came up in a conversation.': 'conversation',
    'I am bored or randomly exploring Wikipedia for fun.': 'bored/random',
    'no_response': 'no response',
    'this topic is important to me and I want to learn more about it. (e.g., to learn about a culture).': 'intrinsic learning',
}

# Internal raw-column name -> the answer dictionary used to recode it.
d_a = {
    'raw_information_depth': d_a1,
    'raw_prior_knowledge': d_a2,
    'raw_motivation': d_a3,
}
In [59]:
def reformat_dt(s):
    """Rewrite a 'month/day/year hour:minute:second' timestamp string as
    'year-month-day hour:minute:second'.

    Raises ValueError (from ``strptime``) when ``s`` does not match the
    expected input pattern.
    """
    parsed = datetime.datetime.strptime(s, "%m/%d/%Y %H:%M:%S")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")
def recode_motivation(x, mapping=None):
    """Recode a raw motivation answer into '|'-joined short labels.

    Every known answer text in ``mapping`` is replaced by its label, the
    result is split on ', ', and any piece that is not a known label becomes
    'other'. Duplicate labels are collapsed and the remaining labels are
    joined in sorted order, so the same combination of reasons always gives
    the same string.

    Parameters
    ----------
    x : any
        Raw answer; converted with ``str`` before processing.
    mapping : dict, optional
        Answer text -> label. Defaults to the module-level ``d_a3``.

    Returns
    -------
    str
        Sorted, de-duplicated labels joined with '|'.
    """
    if mapping is None:
        mapping = d_a3
    text = str(x)
    # Substitute before splitting: some answer texts contain ', ' themselves,
    # so splitting the raw string first would cut them apart.
    for answer, label in mapping.items():
        text = text.replace(answer, label)
    known_labels = set(mapping.values())
    labels = {part if part in known_labels else 'other' for part in text.split(', ')}
    # Set iteration order is not guaranteed to be stable; sorting prevents the
    # same combination from being written in different orders.
    return '|'.join(sorted(labels))
def recode_df(d):
    """Return a recoded deep copy of the raw response frame.

    The copy's columns are renamed through ``d_q`` (a column missing from
    ``d_q`` raises KeyError), the submit timestamp is reformatted with
    ``reformat_dt``, and three label columns are added: 'information depth',
    'prior knowledge' and 'motivation'. Single-choice answers that are not in
    the corresponding answer dictionary are labelled 'other'. The input frame
    is left untouched.
    """
    out = copy.deepcopy(d)
    out.columns = [d_q[name] for name in out.columns]
    out['submit_timestamp'] = out['submit_timestamp'].apply(reformat_dt)

    # Single-choice questions: direct dictionary lookup with an 'other' fallback.
    single_choice = (
        ('information depth', 'raw_information_depth'),
        ('prior knowledge', 'raw_prior_knowledge'),
    )
    for label_col, raw_col in single_choice:
        out[label_col] = out[raw_col].apply(
            lambda answer, raw_col=raw_col: d_a[raw_col].get(answer, 'other')
        )

    # The motivation answer is recoded by its own helper.
    out['motivation'] = out['raw_motivation'].apply(recode_motivation)
    return out
In [60]:
# Build the recoded frame from the filtered responses; recode_df works on a
# copy, so `d` keeps the raw column names and values.
dr = recode_df(d)
In [61]:
# Count responses per recoded prior-knowledge label.
dr['prior knowledge'].value_counts()
Out[61]:
In [62]:
# Count responses per recoded information-depth label.
dr['information depth'].value_counts()
Out[62]:
In [63]:
# Print the raw answer text of the first ten responses whose motivation was
# recoded to exactly 'other'.
for raw_answer in dr.loc[dr['motivation'] == 'other', 'raw_motivation'][:10]:
    print(raw_answer)
In [64]:
# Count responses per recoded motivation label combination.
dr['motivation'].value_counts()
Out[64]:
In [65]:
# Keep a single row per survey token.
# NOTE(review): missing values were filled with 'no_response' before recoding,
# so rows without a token presumably all share that value and would collapse
# into one row here; confirm that is intended.
dr = dr.drop_duplicates('token')
In [66]:
# Write the recoded, de-duplicated responses as a tab-separated file, without
# the index column.
dr.to_csv('../../data/responses.tsv', sep = '\t', index = False)
In [ ]: