In [1]:
# Execute the "4. User comparison" notebook inside this kernel's namespace.
# NOTE(review): names used below but never defined in this notebook (gform,
# getCorrections, getAllResponders, getAllUserVectorData, pd, np) presumably
# come from this %run - confirm against that notebook.
%run "../Functions/4. User comparison.ipynb"
In [2]:
# Rename the columns of the Google Forms table with tags, independently of the
# language the form was written in (the renaming is purely positional).
columnTags = (
    [
        "timestamp", "gameInterest", "gameFrequency", "age", "gender",
        "biologyStudy", "biologyInterest", "synthBioKnowledge",
        "biobrickKnowledge", "previousVersion", "previousPlay",
        "arcadePlay", "androidPlay",
    ]
    + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
    + ["comments", "anonymousID", "lang", "temporality"]
)
columnQuestions = gform.columns.tolist()
googleData = gform.rename(
    columns={question: tag for question, tag in zip(columnQuestions, columnTags)}
)
#googleData.head()
In [3]:
# Replace the answers to the scientific questions (Q1..Q27) by their
# correction, stored as an integer, for every form row that has corrections.
# NOTE(review): presumably 1 means correct and 0 incorrect - confirm against
# getCorrections.
correctedData = googleData.copy()
questionToTag = dict(zip(columnQuestions, columnTags))
scientificTags = ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
for rowId in range(correctedData.shape[0]):
    # Get the correction of the subject who filled in this row
    playerId = correctedData.loc[rowId, "anonymousID"]
    correction = getCorrections(playerId)
    if correction.shape[1] <= 0:
        # No correction column for this subject: keep the raw answers
        continue
    # Index the corrections by tag instead of by question wording
    correction = correction.rename(index=questionToTag)
    # The correction column read is the one suffixed with this row's id
    correctionColumn = "corrections" + str(rowId)
    for questionTag in scientificTags:
        correctedData.loc[rowId, questionTag] = int(correction.loc[questionTag, correctionColumn])
#correctedData.tail(15)
In [4]:
# Display which tag was assigned to which form question (positional pairing)
questionTagPairs = list(zip(columnQuestions, columnTags))
questionTagPairs
Out[4]:
In [5]:
# Keep only the scientific questions: every column from "Q1" to "Q27"
# (label-based slice, both ends included)
scientificColumns = slice("Q1", "Q27")
correctedScientific = correctedData.loc[:, scientificColumns]
#correctedScientific.head()
In [6]:
# Build the table to be coded: the tagged form data without the timestamp and
# free-text comments features (drop returns a new frame, googleData is untouched)
uncodedFeatures = ["timestamp", "comments"]
codedData = googleData.drop(uncodedFeatures, axis=1)
codedData.head()
Out[6]:
In [7]:
# Code answers with integers when possible.
# Each dictionary maps the English and French wordings of an answer to its code.

# gameInterest
gameInterestCoding = {
    "A lot": 4, "Beaucoup": 4,
    "Enormément": 5, "Extremely": 5,
    "Moderately": 3, "Moyennement": 3,
    "Slightly": 2, "Un peu": 2,
    "I don't know": 3, "Je ne sais pas": 3,
    "Not at all": 1, "Pas du tout": 1,
}
# gameFrequency
gameFrequencyCoding = {
    "A lot": 4, "Beaucoup": 4,
    "Enormément": 5, "Extremely": 5,
    "Moderately": 3, "Moyennement": 3,
    "Rarely": 2, "Un peu": 2,
    "I don't know": 3, "Je ne sais pas": 3,
    "Not at all": 1, "Pas du tout": 1,
}
# biologyStudy
biologyStudyCoding = {
    "Not even in middle school": 0, "Jamais": 0, "Jamais, pas même au collège": 0,
    "Until the end of middle school": 1, "Jusqu'au brevet": 1,
    "Until the end of high school": 2, "Jusqu'au bac": 2,
    "Until bachelor's degree": 3, "Jusqu'à la license": 3,
    "At least until master's degree": 4, "Au moins jusqu'au master": 4,
    "I don't know": 0, "Je ne sais pas": 0,
}
# biologyInterest
biologyInterestCoding = {
    "A lot": 4, "Beaucoup": 4,
    "Enormément": 5, "Extremely": 5,
    "Moderately": 3, "Moyennement": 3,
    "Slightly": 2, "Un peu": 2,
    "I don't know": 3, "Je ne sais pas": 3,
    "Not at all": 1, "Pas du tout": 1,
}
# synthBioKnowledge
# biobrickKnowledge
previousKnowledgeCoding = {
    "Yes": 1, "No": 0, "I don't know": 0,
    "Oui": 1, 'Non': 0, "Je ne sais pas": 0,
}
# previousVersion
# previousPlay
# arcadePlay
# androidPlay
previousPlayCoding = {
    "Multiple times": 3, "A few times": 2, "Once": 1, "Yes": 1, "No": 0, "I don't know": 0,
    "De nombreuses fois": 3, "Quelques fois": 2, "Une fois": 1, "Oui": 1, "Non": 0, "Je ne sais pas": 0,
}
# lang
languageCoding = {"en": 0, "fr": 1}
# temporality
temporalityCoding = {"before": 0, "after": 1, "undefined": -5}

# Fill NaN cells.
# The column is reassigned instead of calling fillna(inplace=True) on
# codedData[column]: that chained in-place call works on a temporary object
# and does not update the frame under pandas Copy-on-Write.
for columnTag in ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]:
    codedData[columnTag] = codedData[columnTag].fillna("I don't know")

# Replace by code, one whole column at a time (no assumption on the row index)
columnCodings = [
    ("gameInterest", gameInterestCoding),
    ("gameFrequency", gameFrequencyCoding),
    ("biologyStudy", biologyStudyCoding),
    ("biologyInterest", biologyInterestCoding),
    ("synthBioKnowledge", previousKnowledgeCoding),
    ("biobrickKnowledge", previousKnowledgeCoding),
    ("previousVersion", previousPlayCoding),
    ("previousPlay", previousPlayCoding),
    ("arcadePlay", previousPlayCoding),
    ("androidPlay", previousPlayCoding),
    ("lang", languageCoding),
    ("temporality", temporalityCoding),
]
for columnTag, coding in columnCodings:
    answers = codedData[columnTag]
    # Series.map would silently turn an unknown answer into NaN; keep failing
    # loudly with a KeyError, as a plain dictionary lookup does.
    unknownAnswers = [answer for answer in answers.unique() if answer not in coding]
    if unknownAnswers:
        raise KeyError("No code defined for answers %r in column %r" % (unknownAnswers, columnTag))
    codedData[columnTag] = answers.map(coding)
In [8]:
# One-Hot version: gender and the 27 scientific questions are expanded into
# indicator columns, each prefixed with the tag of its source column
oneHotColumns = ["gender"] + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
codedData = pd.get_dummies(codedData, prefix=oneHotColumns, columns=oneHotColumns)
#codedData.head()
In [9]:
# Split the forms according to their temporality code
# (0: before, 1: after, -5: undefined); each subset is an independent copy
beforeForms = codedData.loc[codedData["temporality"] == 0, :].copy()
afterForms = codedData.loc[codedData["temporality"] == 1, :].copy()
undefForms = codedData.loc[codedData["temporality"] == -5, :].copy()
# Either before or after
defForms = codedData.loc[codedData["temporality"] >= 0, :].copy()

# For subjects with both before and after forms, join the two on their ID
beforeAndAfterForms = beforeForms.merge(
    afterForms,
    on="anonymousID",
    suffixes=('_before', '_after'),
)
#beforeForms.head()
#afterForms.head()
#undefForms.head()
#defForms.head()
#beforeAndAfterForms.head()
In [10]:
# Remove the ID feature from every forms table.
# Careful: running this cell twice fails, the column being already gone.
allForms = codedData.drop("anonymousID", axis=1)
beforeForms = beforeForms.drop("anonymousID", axis=1)
afterForms = afterForms.drop("anonymousID", axis=1)
undefForms = undefForms.drop("anonymousID", axis=1)
defForms = defForms.drop("anonymousID", axis=1)
beforeAndAfterForms = beforeAndAfterForms.drop("anonymousID", axis=1)
In [11]:
# Same preparation for the corrected answers: drop the timestamp and comments
# features (drop returns a new frame, correctedData is untouched)
uncodedCorrectedFeatures = ["timestamp", "comments"]
codedCorrectedData = correctedData.drop(uncodedCorrectedFeatures, axis=1)
In [12]:
# Fill NaN cells.
# The column is reassigned instead of calling fillna(inplace=True) on
# codedCorrectedData[column]: that chained in-place call works on a temporary
# object and does not update the frame under pandas Copy-on-Write.
for columnTag in ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]:
    codedCorrectedData[columnTag] = codedCorrectedData[columnTag].fillna("I don't know")

# Replace by code, using the coding dictionaries defined for codedData.
# Whole columns of codedCorrectedData are mapped at once, so the row count of
# another table (codedData) is no longer used to drive the replacement.
correctedColumnCodings = [
    ("gameInterest", gameInterestCoding),
    ("gameFrequency", gameFrequencyCoding),
    ("biologyStudy", biologyStudyCoding),
    ("biologyInterest", biologyInterestCoding),
    ("synthBioKnowledge", previousKnowledgeCoding),
    ("biobrickKnowledge", previousKnowledgeCoding),
    ("previousVersion", previousPlayCoding),
    ("previousPlay", previousPlayCoding),
    ("arcadePlay", previousPlayCoding),
    ("androidPlay", previousPlayCoding),
    ("lang", languageCoding),
    ("temporality", temporalityCoding),
]
for columnTag, coding in correctedColumnCodings:
    answers = codedCorrectedData[columnTag]
    # Series.map would silently turn an unknown answer into NaN; keep failing
    # loudly with a KeyError, as a plain dictionary lookup does.
    unknownAnswers = [answer for answer in answers.unique() if answer not in coding]
    if unknownAnswers:
        raise KeyError("No code defined for answers %r in column %r" % (unknownAnswers, columnTag))
    codedCorrectedData[columnTag] = answers.map(coding)
In [13]:
# One-Hot version of the corrected table: only gender is expanded here, the
# question columns are left as they are
correctedOneHotColumns = ["gender"]
codedCorrectedData = pd.get_dummies(
    codedCorrectedData,
    prefix=correctedOneHotColumns,
    columns=correctedOneHotColumns,
)
codedCorrectedData.head()
Out[13]:
In [14]:
# Split the corrected forms according to their temporality code
# (0: before, 1: after, -5: undefined); each subset is an independent copy
beforeCorrectedForms = codedCorrectedData.loc[codedCorrectedData["temporality"] == 0, :].copy()
afterCorrectedForms = codedCorrectedData.loc[codedCorrectedData["temporality"] == 1, :].copy()
undefCorrectedForms = codedCorrectedData.loc[codedCorrectedData["temporality"] == -5, :].copy()
# Either before or after
defCorrectedForms = codedCorrectedData.loc[codedCorrectedData["temporality"] >= 0, :].copy()

# For subjects with both before and after forms, join the two on their ID
beforeAndAfterCorrectedForms = beforeCorrectedForms.merge(
    afterCorrectedForms,
    on="anonymousID",
    suffixes=('_before', '_after'),
)
#beforeCorrectedForms.head()
#afterCorrectedForms.head()
#undefCorrectedForms.head()
#defCorrectedForms.head()
#beforeAndAfterCorrectedForms.head()
In [15]:
# Remove the ID feature from every corrected forms table.
# Careful: running this cell twice fails, the column being already gone.
allCorrectedForms = codedCorrectedData.drop("anonymousID", axis=1)
beforeCorrectedForms = beforeCorrectedForms.drop("anonymousID", axis=1)
afterCorrectedForms = afterCorrectedForms.drop("anonymousID", axis=1)
undefCorrectedForms = undefCorrectedForms.drop("anonymousID", axis=1)
defCorrectedForms = defCorrectedForms.drop("anonymousID", axis=1)
beforeAndAfterCorrectedForms = beforeAndAfterCorrectedForms.drop("anonymousID", axis=1)
In [16]:
# Use defForms and defCorrectedForms for coded data
In [17]:
# Fetch RedMetrics data for subjects which answered the gform
# NOTE(review): getAllResponders and getAllUserVectorData are not defined in
# this notebook; presumably they come from the notebook executed with %run in
# the first cell, and the meaning of _source=[] is set there - confirm.
allData = getAllUserVectorData(getAllResponders(), _source=[])
In [18]:
# Put subjects as rows and features as columns.
# Careful: allData is rebound to its own transpose, so running this cell a
# second time swaps rows and columns back.
allData = allData.T
allData.head()
Out[18]:
In [19]:
# Drop the "switch" and "gotomooc" columns (useless: all rows hold the same
# value), then replace every NaN by the negative marker -1
uninformativeFeatures = ["switch", "gotomooc"]
allData = allData.drop(uninformativeFeatures, axis=1).fillna(-1)
In [20]:
# Get the subset of subjects who have answered before and after:
# both scores must be non-negative
answeredBefore = allData["scorebefore"] >= 0
answeredAfter = allData["scoreafter"] >= 0
fullProcessData = allData[answeredBefore & answeredAfter]
#fullProcessData.loc["01e85778-2903-447b-bbab-dd750564ee2d",:]
In [21]:
# Get the subset of subjects who have answered the after questionnaire
withAfterData = allData[allData["scoreafter"] >= 0].copy()
# Expose the row index as an ID column, without its surrounding double quotes
withAfterData["anonymousID"] = withAfterData.index
withAfterData["anonymousID"] = withAfterData["anonymousID"].map(lambda userId: userId.strip('"'))

# Join with questionnaire data with complete (not corrected) answers
afterNotCorrected = codedData.loc[codedData["temporality"] == 1, :].copy()
gameAndAfterData = pd.merge(withAfterData, afterNotCorrected, on="anonymousID").drop("anonymousID", axis=1)

# Join with questionnaire data with corrected answers
afterCorrected = codedCorrectedData.loc[codedCorrectedData["temporality"] == 1, :].copy()
gameAndCorrectedAfterData = pd.merge(withAfterData, afterCorrected, on="anonymousID").drop("anonymousID", axis=1)
#gameAndAfterData.head()
#gameAndCorrectedAfterData.head()
In [22]:
# Work on a copy of allData, with the row index exposed as an ID column
# stripped of its surrounding double quotes
allDataClassif = allData.copy()
allDataClassif["anonymousID"] = allDataClassif.index
allDataClassif["anonymousID"] = allDataClassif["anonymousID"].map(lambda userId: userId.strip('"'))
def floorCheckpoints(value):
    """Cap a checkpoint time at 3600 (1h).

    Values above 3600 are replaced by 3600; any other value is returned
    unchanged.
    NOTE(review): the original comment read "If checkpoint not reached, set
    time to 3600"; that relies on unreached checkpoints holding a value above
    3600 - confirm.
    """
    return 3600 if value > 3600 else value
# Cap the time of every checkpoint column, i.e. every column labelled with an
# integer. Any integer label is accepted (Python int or any NumPy integer
# width): testing for np.int64 only would silently skip the checkpoint columns
# whenever their labels are np.int32 or plain int.
for col in allDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        allDataClassif[col] = allDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
allDataClassif["completionTime"] = allDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))
# Cap thoroughness at 1000
allDataClassif["thoroughness"] = allDataClassif["thoroughness"].apply(lambda score: min(score, 1000))
In [23]:
# Join the game data of the "after" responders with their corrected answers
gameAndCorrectedAfterDataClassif = pd.merge(withAfterData, afterCorrected, on="anonymousID")
# Strip surrounding double quotes from the IDs
gameAndCorrectedAfterDataClassif['anonymousID'] = gameAndCorrectedAfterDataClassif['anonymousID'].apply(lambda userId: userId.strip('"'))
gameAndCorrectedAfterDataClassif = gameAndCorrectedAfterDataClassif.drop(["scorebefore", "scoreundefined", "temporality"], axis=1)
# Cap the time of every checkpoint column, i.e. every column labelled with an
# integer. Any integer label is accepted (Python int or any NumPy integer
# width): testing for np.int64 only would silently skip the checkpoint columns
# whenever their labels are np.int32 or plain int.
for col in gameAndCorrectedAfterDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        gameAndCorrectedAfterDataClassif[col] = gameAndCorrectedAfterDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
gameAndCorrectedAfterDataClassif["completionTime"] = gameAndCorrectedAfterDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))
# Cap thoroughness at 1000
gameAndCorrectedAfterDataClassif["thoroughness"] = gameAndCorrectedAfterDataClassif["thoroughness"].apply(lambda score: min(score, 1000))
In [24]:
# Get the subset of subjects who have answered the before questionnaire
withBeforeData = allData[allData["scorebefore"] >= 0].copy()
# Expose the row index as an ID column, without its surrounding double quotes
withBeforeData['anonymousID'] = withBeforeData.index
withBeforeData['anonymousID'] = withBeforeData['anonymousID'].apply(lambda userId: userId.strip('"'))
beforeCorrected = codedCorrectedData.loc[codedCorrectedData["temporality"] == 0, :].copy()
# Join with questionnaire data with corrected answers
gameAndCorrectedBeforeDataClassif = pd.merge(withBeforeData, beforeCorrected, on="anonymousID")
gameAndCorrectedBeforeDataClassif['anonymousID'] = gameAndCorrectedBeforeDataClassif['anonymousID'].apply(lambda userId: userId.strip('"'))
gameAndCorrectedBeforeDataClassif = gameAndCorrectedBeforeDataClassif.drop(["scoreafter", "scoreundefined", "temporality"], axis=1)
# Cap the time of every checkpoint column, i.e. every column labelled with an
# integer. Any integer label is accepted (Python int or any NumPy integer
# width): testing for np.int64 only would silently skip the checkpoint columns
# whenever their labels are np.int32 or plain int.
for col in gameAndCorrectedBeforeDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        gameAndCorrectedBeforeDataClassif[col] = gameAndCorrectedBeforeDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
gameAndCorrectedBeforeDataClassif["completionTime"] = gameAndCorrectedBeforeDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))
# Cap thoroughness at 1000
gameAndCorrectedBeforeDataClassif["thoroughness"] = gameAndCorrectedBeforeDataClassif["thoroughness"].apply(lambda score: min(score, 1000))
In [ ]:
In [ ]:
In [ ]: