Google form analysis

Analysis of results extracted from Google Forms in CSV format.

general purpose
sessions and temporalities
score
visualizations
sample getters
checkpoint validation
p(answered question N | answered question P)
Filtering users

Preparation



In [ ]:

    
%run "../Functions/2. Game sessions.ipynb"

Constants



In [ ]:

    
# special user ids
# Hard-coded player GUIDs kept as named fixtures.
# NOTE(review): the names suggest the answer pattern each user is expected to
# show (no answer, one answer, several answers, one/several scores, per
# language) - confirm against the actual form data.
userIDThatDidNotAnswer = '001c95c6-8207-43dc-a51b-adf0c6e005d7'

# English ("EN") fixtures
userID1AnswerEN = '00dbbdca-d86c-4bc9-803c-0602e0153f68'
userIDAnswersEN = '5977184a-1be2-4725-9b48-f2782dc03efb'
userID1ScoreEN = '6b5d392d-b737-49ef-99af-e8c445ff6379'
userIDScoresEN = '5ecf601d-4eac-433e-8056-3a5b9eda0555'

# French ("FR") fixtures
userID1AnswerFR = '2734a37d-4ba5-454f-bf85-1f7b767138f6'
userIDAnswersFR = '01e85778-2903-447b-bbab-dd750564ee2d'
userID1ScoreFR = '3d733347-0313-441a-b77c-3e4046042a53'
userIDScoresFR = '58d22690-8604-41cf-a5b7-d71fb3b9ad5b'

# fixture whose name mentions both languages
userIDAnswersENFR = 'a7936587-8b71-43b6-9c61-17b2c2b55de3'



In [ ]:

    
# Title of the gform column that holds the player GUID; every per-user lookup
# in this notebook goes through this key.
# Alternative (French) column title, kept for reference:
#localplayerguidkey = 'Ne pas modifier - identifiant anonyme prérempli'
localplayerguidkey = 'Do not edit -  pre-filled anonymous ID'
# positional index of that column in gform
localplayerguidindex = gform.columns.get_loc(localplayerguidkey)
# bare expression: displays the value as the notebook cell output
localplayerguidindex



In [ ]:

    
# Title of the gform column named here as the first evaluation question.
firstEvaluationQuestionKey = 'In order to modify the abilities of the bacterium, you have to...'
# positional index of that column in gform
firstEvaluationQuestionIndex = gform.columns.get_loc(firstEvaluationQuestionKey)
# bare expression: displays the value as the notebook cell output
firstEvaluationQuestionIndex



In [ ]:

    
# Prefixes used to name per-user columns: getAnswers() produces
# "answers<gform row label>" columns and getCorrections() adds the matching
# "corrections<gform row label>" columns.
answersColumnNameStem = "answers"
correctionsColumnNameStem = "corrections"

Functions

general purpose



In [ ]:

    
def getUniqueUserCount(sample):
    """Return the number of distinct player GUIDs found in `sample`."""
    _userIds = sample[localplayerguidkey]
    return _userIds.nunique()



In [ ]:

    
def getAllResponders( _form = gform ):
    """Return the distinct player GUIDs present in `_form` (default: gform)."""
    return _form[localplayerguidkey].unique()

def getRandomGFormGUID():
    """Return a randomly drawn responder id that passes isGUIDFormat.

    Ids are drawn with randint until one is accepted by isGUIDFormat.
    NOTE(review): there is no exit condition if no responder id is a valid
    GUID - confirm this cannot happen with the real data.
    """
    _candidates = getAllResponders()
    _lastPosition = len(_candidates) - 1
    _guid = '0'
    while True:
        if isGUIDFormat(_guid):
            return _guid
        _guid = _candidates[randint(0, _lastPosition)]

def hasAnswered( userId, _form = gform ):
    """Tell whether `userId` appears in the player GUID column of `_form`."""
    _knownIds = _form[localplayerguidkey].values
    return userId in _knownIds

def getAnswers( userId, _form = gform ):
    """Return the form rows of `userId`, transposed.

    The result has one row per gform column and one column per response of
    the user; each column is named answersColumnNameStem followed by the
    label of the response in `_form`. When the user has no response, a
    message is printed and the (empty) transposed frame is returned.
    """
    _userRows = _form[_form[localplayerguidkey]==userId]
    _transposed = _userRows.T

    if len(_userRows) == 0:
        # user has never answered
        print("user " + str(userId) + " has never answered")
        return _transposed

    _transposed.columns = [
        answersColumnNameStem + str(_rowLabel) for _rowLabel in _transposed.columns
    ]
    return _transposed

sessions and temporalities



In [ ]:

    
def setAnswerTemporalities( _gformDF = gform ):
    """Fill the 'Temporality' column of `_gformDF` in place.

    Nothing is done unless the 'Temporality' column currently holds exactly
    one distinct value (i.e. temporalities have not been set yet).

    Each response is classified with getTemporality() against the user's
    first game event date (getFirstEventDate). Per user, at most one response
    keeps answerTemporalities[0] (the one with the latest timestamp) and at
    most one keeps answerTemporalities[1] (the one with the earliest
    timestamp); the other responses of that kind are relabelled
    answerTemporalities[2].

    "No response retained yet" is tracked with None rather than 0, because 0
    is a valid index label of the form DataFrame.
    """
    # check whether temporalities have already been set
    if len(_gformDF['Temporality'].unique()) != 1:
        return

    _before, _after, _undefined = answerTemporalities[:3]

    # format : key = _userId,
    # value = [_firstEventDate, None or _gformDF.index of before, None or _gformDF.index of after]
    temporalities = {}

    for _index in _gformDF.index:
        _userId = _gformDF.loc[_index,localplayerguidkey]

        if _userId in temporalities:
            _firstEventDate, beforeIndex, afterIndex = temporalities[_userId]
        else:
            _firstEventDate, beforeIndex, afterIndex = getFirstEventDate(_userId), None, None

        _timestamp = _gformDF.loc[_index,'Timestamp']
        temporality = getTemporality(_timestamp, _firstEventDate)

        if temporality == _before and beforeIndex is not None:
            # keep only the latest 'before' response
            if _timestamp > _gformDF.loc[beforeIndex,'Timestamp']:
                _gformDF.loc[beforeIndex,'Temporality'] = _undefined
            else:
                temporality = _undefined
        elif temporality == _after and afterIndex is not None:
            # keep only the earliest 'after' response
            if _timestamp < _gformDF.loc[afterIndex,'Timestamp']:
                _gformDF.loc[afterIndex,'Temporality'] = _undefined
            else:
                temporality = _undefined

        _gformDF.loc[_index,'Temporality'] = temporality
        if temporality == _before:
            beforeIndex = _index
        elif temporality == _after:
            afterIndex = _index

        temporalities[_userId] = [_firstEventDate, beforeIndex, afterIndex]
    print("temporalities set")

def getTemporality( answerDate, gameEventDate ):
    """Classify a questionnaire answer relative to a game event date.

    answerDate is assumed to be the gform Timestamp, UTC.
    gameEventDate is assumed to be of type pandas._libs.tslib.Timestamp, UTC,
    from RedMetrics.

    Returns answerTemporalities[0] when answerDate <= gameEventDate,
    answerTemporalities[1] when answerDate > gameEventDate, and
    answerTemporalities[2] when gameEventDate equals the UTC-localized
    pd.Timestamp.max or when neither comparison holds.
    """
    _noEventDate = pd.Timestamp.max.tz_localize('utc')
    if gameEventDate != _noEventDate:
        if answerDate <= gameEventDate:
            return answerTemporalities[0]
        if answerDate > gameEventDate:
            return answerTemporalities[1]
    return answerTemporalities[2]

score



In [ ]:

    
def getCorrections( _userId, _source = correctAnswers, _form = gform, _columnAnswers = [] ):
    """Add one corrections column per answers column of a user.

    _userId: player GUID, only used when `_columnAnswers` is empty.
    _source: indexed by question; `_source.loc[question]` is iterated as the
        collection of accepted answers for that question (an empty collection
        means the question is not graded).
    _form: form DataFrame the answers are read from.
    _columnAnswers: optional pre-computed output of getAnswers(); it is
        modified in place.

    For every column whose name contains answersColumnNameStem, a column named
    with correctionsColumnNameStem and the same suffix is added. A graded
    question is True when the stringified answer starts with one of the
    stringified accepted answers, False otherwise; ungraded questions stay NaN.

    Returns `_columnAnswers` (with the extra columns when the user answered).
    """
    # the default [] is only tested for emptiness, never mutated
    if(len(_columnAnswers) == 0):
        _columnAnswers = getAnswers( _userId, _form = _form )

    if 0 != len(_columnAnswers.columns):

        _questionsCount = len(_columnAnswers.values)

        # the loop iterates the columns Index captured here; the corrections
        # columns added below are not visited
        for _columnName in _columnAnswers.columns:
            if answersColumnNameStem in _columnName:
                _answerNumber = _columnName.replace(answersColumnNameStem,"")
                newCorrectionsColumnName = correctionsColumnNameStem + _answerNumber

                #_columnAnswers[newCorrectionsColumnName] = _columnAnswers[_columnName]
                # start from an all-NaN column
                _columnAnswers[newCorrectionsColumnName] = pd.Series(np.full(_questionsCount, np.nan))

                for question in _columnAnswers[_columnName].index:
                    _correctAnswers = _source.loc[question]
                    
                    if(len(_correctAnswers) > 0):
                        # graded question: wrong unless a prefix match is found
                        _columnAnswers.loc[question,newCorrectionsColumnName] = False
                        for _correctAnswer in _correctAnswers:
                            if str(_columnAnswers.loc[question,_columnName])\
                            .startswith(str(_correctAnswer)):
                                _columnAnswers.loc[question,newCorrectionsColumnName] = True
                                break
                        

    else:
        # user has never answered
        print("can't give correct answers")
    return _columnAnswers
    

def getBinarizedCorrections( _corrections ):
    """Replace True by 1.0 and False by 0.0 in `_corrections`, in place.

    _corrections must be a dataframe full of corrections as produced by
    getCorrections. Cells equal to neither True nor False are left untouched.
    The same (edited) dataframe is returned.
    """
    for _column in _corrections.columns:
        for _row in _corrections[_column].index:
            _cell = _corrections.loc[_row, _column]
            if True == _cell:
                _corrections.loc[_row, _column] = 1.0
            elif False == _cell:
                _corrections.loc[_row, _column] = 0.0
    return _corrections

def getBinarized(_gformLine, _source = correctAnswers):
    """Binarize a single gform line.

    _gformLine: one response, indexed by question.
    _source: indexed by question; `_source.loc[question]` is the collection
        of accepted answers (empty when the question is not graded).

    Returns a Series restricted to the graded questions of `_source`, holding
    1 when the stringified answer starts with one of the stringified accepted
    answers and 0 otherwise.
    """
    _gradedQuestions = [
        _question for _question in _source.index if len(_source.loc[_question]) > 0
    ]

    _binarized = pd.Series(np.full(len(_gformLine.index), np.nan), index = _gformLine.index)

    for question in _gformLine.index:
        _accepted = _source.loc[question]
        if len(_accepted) == 0:
            continue

        _binarized[question] = 0
        if any(str(_gformLine.loc[question]).startswith(str(_candidate)) for _candidate in _accepted):
            _binarized.loc[question] = 1

    return _binarized.loc[_gradedQuestions]

def getAllBinarized(_source = correctAnswers, _form = gform ):
    """Build the binarized corrections matrix of every responder of `_form`.

    _source: indexed by question; `_source.loc[question]` is the collection
        of accepted answers (empty when the question is not graded).
    _form: form DataFrame to correct.

    Returns a DataFrame with one row per corrections column (one per response)
    and one column per graded question of `_source`.

    The per-user slices are collected in a list and concatenated once, instead
    of re-concatenating the growing result on every iteration.
    """
    _notEmptyIndexes = [
        _index for _index in _source.index if len(_source.loc[_index]) > 0
    ]

    # the leading empty frame fixes the row index, even without responders
    _frames = [pd.DataFrame(index = _notEmptyIndexes)]
    for _userId in getAllResponders( _form = _form ):
        _corrections = getCorrections(_userId, _source=_source, _form = _form)
        _binarized = getBinarizedCorrections(_corrections)
        # keep graded questions only, and corrections columns only
        _correctionColumns = _binarized.columns[
            _binarized.columns.to_series().str.contains(correctionsColumnNameStem)
        ]
        _frames.append(_binarized.loc[_notEmptyIndexes][_correctionColumns])

    _result = pd.concat(_frames, axis=1)

    return _result.T
    

def getCrossCorrectAnswers( _binarizedAnswers ):
    """Return the cross-correct-answers matrix CCA = tA.A.

    CCA.iloc[i,j] is the number of users who correctly answered questions
    number i and j:
    CCA[i,j] = Sum(A[u,i] * A[u,j], u in users)
             = Sum(tA[i,u] * A[u,j], u in users)
             = tA.A[i,j]
    """
    _transposed = _binarizedAnswers.T
    return _transposed.dot(_binarizedAnswers)

# Returns the scores of a user, grouped by temporality.
# Row label of the DataFrame returned by getScore.
scoreLabel = 'score'
def getScore( _userId, _form = gform, _source = correctAnswers ):
    """Return the scores of `_userId`, one list per temporality.

    The result is a one-row DataFrame (row label scoreLabel) whose columns are
    answerTemporalities; each cell is a list with one score per response of
    that temporality. A score is the number of True values in the response's
    corrections column, as computed by getCorrections.

    When the user has no response in `_form`, a message is printed and the
    lists stay empty.
    """
    _score = pd.DataFrame({}, columns = answerTemporalities)
    # the row is created with NaN first, then each cell is replaced by a list
    _score.loc[scoreLabel,:] = np.nan
    for _column in _score.columns:
        _score.loc[scoreLabel, _column] = []

    if hasAnswered( _userId, _form = _form ):
        _columnAnswers = getCorrections(_userId, _form = _form, _source = _source)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # the temporality is read from the matching answers column
                _answerColumnName = _columnName.replace(correctionsColumnNameStem,\
                                                      answersColumnNameStem)
                _temporality = _columnAnswers.loc['Temporality',_answerColumnName]

                _counts = (_columnAnswers[_columnName]).value_counts()
                _thisScore = 0
                if(True in _counts):
                    _thisScore = _counts[True]
                _score.loc[scoreLabel,_temporality].append(_thisScore)
    else:
        print("user " + str(_userId) + " has never answered")

    return _score


def getGFormRowCorrection( _gformRow, _source = correctAnswers):
    """Correct a single gform row.

    _gformRow: one response, indexed by question.
    _source: indexed by question; `_source.loc[question]` is the collection
        of accepted answers (empty when the question is not graded).

    Returns a Series indexed like `_gformRow`: True when the stringified
    answer starts with one of the stringified accepted answers, False when it
    does not, NaN for ungraded questions. An empty row prints a message and
    is returned as a copy.
    """
    if len(_gformRow) == 0:
        print("this gform row is empty")
        return _gformRow.copy()

    _correction = pd.Series(index = _gformRow.index, data = np.full(len(_gformRow), np.nan))

    for question in _correction.index:
        _accepted = _source.loc[question]
        if len(_accepted) == 0:
            continue

        # graded question: wrong unless a prefix match is found
        _correction.loc[question] = False
        if any(str(_gformRow.loc[question]).startswith(str(_candidate)) for _candidate in _accepted):
            _correction.loc[question] = True

    return _correction

def getGFormRowScore( _gformRow, _source = correctAnswers):
    """Return the number of True entries in the correction of `_gformRow`."""
    _tally = getGFormRowCorrection( _gformRow, _source = _source).value_counts()
    if True in _tally:
        return _tally[True]
    return 0

visualizations



In [ ]:

    
def createStatSet(series, ids = pd.Series()):
    """Summarize `series` in a dict.

    series: values to summarize.
    ids: identifiers used for the 'count' and 'unique' entries; when empty,
        the index of `series` is used instead.

    Returns a dict with keys 'count', 'unique', 'median', 'mean' and 'std'.
    """
    if len(ids) == 0:
        ids = series.index

    _stats = {}
    _stats['count'] = len(ids)
    _stats['unique'] = len(ids.unique())
    _stats['median'] = series.median()
    _stats['mean'] = series.mean()
    _stats['std'] = series.std()
    return _stats

def getPercentagePerQuestion(_binarized):
    """Return, per question, the percentage of rows holding a correct answer.

    _binarized must be well-formed, similarly to getAllBinarized's output:
    one row per response, one column per question.

    Returns a one-column DataFrame indexed by the columns of `_binarized`.
    """
    _rowCount = _binarized.shape[0]
    # column sums, computed as the product of a row of ones with the matrix
    _columnTotals = np.dot(np.ones(_rowCount), _binarized)
    _totalsFrame = pd.DataFrame(data=_columnTotals, index=_binarized.columns)
    return _totalsFrame*100 / _rowCount

## sample can be: all, those who answered both before and after,
## those who played between date1 and date2, ...
from scipy.stats import ttest_ind
def plotBasicStats(
    sample,
    title = '',
    includeAll = False,
    includeBefore = True,
    includeAfter = True,
    includeUndefined = False,
    includeProgress = True,
    includeRelativeProgress = False,
):
    """Print basic score statistics of `sample` and plot the % of correct answers.

    sample: form DataFrame; it is split on its 'Temporality' column into
        'before', 'after' and 'undefined' sub-samples.
    title: printed above the statistics and appended to the heatmap title.
    include*: select which categories ('all', 'before', 'after', 'undefined',
        'progress', 'rel. progress') are computed, printed and plotted.
        'progress' is computed as after minus before; 'rel. progress' divides
        that difference by the 'before' percentage (0 when 'before' is 0).

    For each computed category the responses are binarized twice with
    getAllBinarized: against correctAnswers, and against
    correctAnswers + demographicAnswers. Scores are the row sums of the first
    matrix. A t-test (ttest_ind) between before and after scores is printed
    when both hold more than 2 scores. Tables containing a NaN are left out of
    the heatmap.

    Returns the 8 binarized matrices: (sci all, sci before, sci after,
    sci undefined, all all, all before, all after, all undefined); the ones
    that were not computed are empty DataFrames.

    NOTE(review): stepsCount does not count includeRelativeProgress, and the
    before/after branches also run when only includeProgress is set, so the
    number of progress increments may differ from stepsCount.
    """
    
    stepsPerInclude = 2
    includeCount = np.sum([includeAll, includeBefore, includeAfter, includeUndefined, includeProgress])
    stepsCount = stepsPerInclude*includeCount + 3
    
    #print("stepsPerInclude=" + str(stepsPerInclude))
    #print("includeCount=" + str(includeCount))
    #print("stepsCount=" + str(stepsCount))
    
    __progress = FloatProgress(min=0, max=stepsCount)
    display(__progress)
    
    sampleBefore = sample[sample['Temporality'] == 'before']
    sampleAfter = sample[sample['Temporality'] == 'after']
    sampleUndefined = sample[sample['Temporality'] == 'undefined']

    #uniqueBefore = sampleBefore[localplayerguidkey]
    #uniqueAfter = 
    #uniqueUndefined =

    scientificQuestions = correctAnswers.copy()
    allQuestions = correctAnswers + demographicAnswers
    
    categories = ['all', 'before', 'after', 'undefined', 'progress', 'rel. progress']
    data = {}
    
    # results default to empty DataFrames so that the return statement and
    # the plotting code work whatever categories are enabled
    sciBinarized = pd.DataFrame()
    allBinarized = pd.DataFrame()
    scoresAll = pd.DataFrame()
    
    sciBinarizedBefore = pd.DataFrame()
    allBinarizedBefore = pd.DataFrame()
    scoresBefore = pd.DataFrame()
    
    sciBinarizedAfter = pd.DataFrame()
    allBinarizedAfter = pd.DataFrame()
    scoresAfter = pd.DataFrame()
    
    sciBinarizedUndefined = pd.DataFrame()
    allBinarizedUndefined = pd.DataFrame()
    scoresUndefined = pd.DataFrame()

    scoresProgress = pd.DataFrame()

    ## basic stats:
    ### mean score
    ### median score
    ### std
    if includeAll:
        sciBinarized = getAllBinarized( _source = scientificQuestions, _form = sample)
        __progress.value += 1
        allBinarized = getAllBinarized( _source = allQuestions, _form = sample)
        __progress.value += 1
        # score of a response = sum of its binarized scientific answers
        scoresAll = pd.Series(np.dot(sciBinarized, np.ones(sciBinarized.shape[1])))
        
        data[categories[0]] = createStatSet(scoresAll, sample[localplayerguidkey])
        
    # 'before' data is also needed to compute the progress
    if includeBefore or includeProgress:
        sciBinarizedBefore = getAllBinarized( _source = scientificQuestions, _form = sampleBefore)
        __progress.value += 1
        allBinarizedBefore = getAllBinarized( _source = allQuestions, _form = sampleBefore)
        __progress.value += 1
        scoresBefore = pd.Series(np.dot(sciBinarizedBefore, np.ones(sciBinarizedBefore.shape[1])))
        temporaryStatSetBefore = createStatSet(scoresBefore, sampleBefore[localplayerguidkey])
    if includeBefore:
        data[categories[1]] = temporaryStatSetBefore
        
    # 'after' data is also needed to compute the progress
    if includeAfter or includeProgress:
        sciBinarizedAfter = getAllBinarized( _source = scientificQuestions, _form = sampleAfter)
        __progress.value += 1
        allBinarizedAfter = getAllBinarized( _source = allQuestions, _form = sampleAfter)
        __progress.value += 1
        scoresAfter = pd.Series(np.dot(sciBinarizedAfter, np.ones(sciBinarizedAfter.shape[1])))
        temporaryStatSetAfter = createStatSet(scoresAfter, sampleAfter[localplayerguidkey])
    if includeAfter:
        data[categories[2]] = temporaryStatSetAfter
        
    if includeUndefined:
        sciBinarizedUndefined = getAllBinarized( _source = scientificQuestions, _form = sampleUndefined)
        __progress.value += 1
        allBinarizedUndefined = getAllBinarized( _source = allQuestions, _form = sampleUndefined)
        __progress.value += 1
        scoresUndefined = pd.Series(np.dot(sciBinarizedUndefined, np.ones(sciBinarizedUndefined.shape[1])))
        
        data[categories[3]] = createStatSet(scoresUndefined, sampleUndefined[localplayerguidkey])

    if includeProgress:
        # progress = difference of the 'after' and 'before' statistics
        data[categories[4]] = {
            'count' : min(temporaryStatSetAfter['count'], temporaryStatSetBefore['count']),
            'unique' : min(temporaryStatSetAfter['unique'], temporaryStatSetBefore['unique']),
            'median' : temporaryStatSetAfter['median']-temporaryStatSetBefore['median'],
            'mean' : temporaryStatSetAfter['mean']-temporaryStatSetBefore['mean'],
            'std' : temporaryStatSetAfter['std']-temporaryStatSetBefore['std'],
        }
        __progress.value += 2
    
    
    result = pd.DataFrame(data)
    __progress.value += 1

    print(title)
    print(result)
    if (includeBefore and includeAfter) or includeProgress:
        # the t-test is only run when both groups hold more than 2 scores
        if (len(scoresBefore) > 2 and len(scoresAfter) > 2):
            ttest = ttest_ind(scoresBefore, scoresAfter)
            print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
    print()

    ## percentage correct
    ### percentage correct - max 5 columns
    percentagePerQuestionAll = pd.DataFrame()
    percentagePerQuestionBefore = pd.DataFrame()
    percentagePerQuestionAfter = pd.DataFrame()
    percentagePerQuestionUndefined = pd.DataFrame()
    percentagePerQuestionProgress = pd.DataFrame()
    
    # list of [table, category name] pairs, candidates for the heatmap
    tables = []

    if includeAll:
        percentagePerQuestionAll = getPercentagePerQuestion(allBinarized)
        tables.append([percentagePerQuestionAll, categories[0]])
        
    if includeBefore or includeProgress:
        percentagePerQuestionBefore = getPercentagePerQuestion(allBinarizedBefore)
    if includeBefore:
        tables.append([percentagePerQuestionBefore, categories[1]])
        
    if includeAfter or includeProgress:
        percentagePerQuestionAfter = getPercentagePerQuestion(allBinarizedAfter)
    if includeAfter:
        tables.append([percentagePerQuestionAfter, categories[2]])
        
    if includeUndefined:
        percentagePerQuestionUndefined = getPercentagePerQuestion(allBinarizedUndefined)
        tables.append([percentagePerQuestionUndefined, categories[3]])
        
    if includeProgress or includeRelativeProgress:
        percentagePerQuestionProgress = percentagePerQuestionAfter - percentagePerQuestionBefore
        
        if includeProgress:
            tables.append([percentagePerQuestionProgress, categories[4]])
            
        if includeRelativeProgress:
            # use temporaryStatSetAfter['count'], temporaryStatSetBefore['count']?
            percentagePerQuestionProgress2 = percentagePerQuestionProgress.copy()
            for index in range(0,len(percentagePerQuestionProgress.index)):
                # avoid dividing by a 'before' percentage of 0
                if (0 == percentagePerQuestionBefore.iloc[index,0]):
                    percentagePerQuestionProgress2.iloc[index,0] = 0
                else:
                    percentagePerQuestionProgress2.iloc[index,0] = \
                    percentagePerQuestionProgress.iloc[index,0]/percentagePerQuestionBefore.iloc[index,0]
            tables.append([percentagePerQuestionProgress2, categories[5]])
    
    __progress.value += 1

    graphTitle = '% correct: '
    toConcat = []
    
    # only non-empty tables without any NaN are plotted
    for table,category in tables:
        concat = (len(table.values) > 0)
        for elt in table.iloc[:,0].values:
            if np.isnan(elt):
                concat = False
                break
        if(concat):
            graphTitle = graphTitle + category + ' '
            toConcat.append(table)

    if (len(toConcat) > 0):
        percentagePerQuestionConcatenated = pd.concat(
            toConcat
            , axis=1)

        if(len(title) > 0):
            graphTitle = graphTitle + ' - ' + title

        _fig = plt.figure(figsize=(20,20))
        _ax1 = plt.subplot(111)
        _ax1.set_title(graphTitle)
        sns.heatmap(percentagePerQuestionConcatenated.round().astype(int),ax=_ax1,cmap=plt.cm.jet,square=True,annot=True,fmt='d')
    __progress.value += 1
    
    ### percentage cross correct
    ### percentage cross correct, conditionnally
    
    # sanity check of the progress accounting
    if(__progress.value != stepsCount):
        print("__progress.value=" + str(__progress.value) + " != stepsCount=" + str(stepsCount))

    return sciBinarized, sciBinarizedBefore, sciBinarizedAfter, sciBinarizedUndefined, \
            allBinarized, allBinarizedBefore, allBinarizedAfter, allBinarizedUndefined

    
    
def plotCorrelationMatrices(
    allBinarized = [],
    beforeBinarized = [],
    afterBinarized = [],
    undefinedBinarized = [],
    titleAll = 'Correlation of pre- & post-test answers',
    titleBefore = 'Correlation of pre-test answers',
    titleAfter = 'Correlation of post-test answers',
    titleUndefined = 'Correlation of undefined answers',
    titleSuffix = '',
):
    """Plot one correlation matrix per non-empty binarized matrix.

    Each of the four matrices (all, before, after, undefined) is plotted with
    plotCorrelationMatrix using absolute values, question numbers,
    annotations and a 20x20 figure, without clustering. Its title is the
    matching title parameter followed by `titleSuffix`. Empty inputs are
    skipped.
    """
    _matricesAndTitles = [
        (allBinarized, titleAll + titleSuffix),
        (beforeBinarized, titleBefore + titleSuffix),
        (afterBinarized, titleAfter + titleSuffix),
        (undefinedBinarized, titleUndefined + titleSuffix),
    ]

    for _matrix, _matrixTitle in _matricesAndTitles:
        if len(_matrix) == 0:
            continue
        plotCorrelationMatrix(
            _matrix,
            _abs=True,
            _clustered=False,
            _questionNumbers=True,
            _annot = True,
            _figsize = (20,20),
            _title=_matrixTitle,
        )
    
##correlation
### simple heatmap
### clustermap
# correlation methods accepted by plotCorrelationMatrix; the first one is the fallback
methods = ['pearson', 'kendall', 'spearman']
def plotCorrelationMatrix( 
    _binarizedMatrix, 
    _method = methods[0], 
    _title='Questions\' Correlations', 
    _abs=False, 
    _clustered=False, 
    _questionNumbers=False,
    _annot = False,
    _figsize = (10,10),
    _metric='euclidean'
):
    """Plot the correlation matrix of the columns of `_binarizedMatrix`.

    _binarizedMatrix: DataFrame, one column per question.
    _method: one of `methods`; any other value falls back to methods[0].
    _title: title of the heatmap (non-clustered plot only).
    _abs: plot absolute correlation values.
    _clustered: draw a seaborn clustermap instead of a heatmap; rows that are
        entirely NaN in the correlation matrix are dropped first.
    _questionNumbers: append ' #<position>' to the labels.
    _annot: annotate cells with getCrossCorrectAnswers(_binarizedMatrix).
    _figsize: figure size.
    _metric: distance metric passed to the clustermap.

    Returns (clustergrid, overlay) when `_clustered` is set; otherwise the
    heatmap is drawn and nothing is returned.
    """
    
    _progress = FloatProgress(min=0, max=7)
    display(_progress)
    
    # False means "no annotation" when passed as annot= below
    _overlay = False

    _progress.value += 1
    
    # computation of correlation matrix
    _m = _method
    if(not (_method in methods)):
        _m = methods[0]
    _correlation = _binarizedMatrix.astype(float).corr(_m)
    _progress.value += 1
    if(_abs):
        _correlation = _correlation.abs()
    _progress.value += 1
    
    if(_clustered):
    # removing NaNs
    # can't cluster NaN lines in _correlation
        _notNaNsIndices = []
        _notNaNsColumns = []
        for index in _correlation.index:
            if(~np.isnan(_correlation.loc[index,:]).all()):
                _notNaNsIndices.append(index)
        #for column in _correlation.columns:
        #    if(~np.isnan(_correlation.loc[:,column]).all()):
        #        _notNaNsColumns.append(column)
        
        # the same labels are used for rows and columns
        _binarizedMatrix = _binarizedMatrix.loc[:,_notNaNsIndices]
        _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
    _progress.value += 1
        
        
    # optional computation of overlay
    if(_annot):
        _overlay = getCrossCorrectAnswers(_binarizedMatrix).astype(int)
    _progress.value += 1
    
    # preparation of plot labels
    if(_questionNumbers):
        _correlation.columns = pd.Series(_correlation.columns).apply(\
                lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
        if(_clustered):
            # NOTE(review): the columns were already relabelled above, so the
            # row labels get both a '#n ' prefix and the ' #n' suffix - confirm
            _correlation.index = pd.Series(_correlation.columns).apply(\
                lambda x: '#' + str(_correlation.columns.get_loc(x) + 1) + ' ' + x)
        else:
            _correlation.index = _correlation.columns
    _progress.value += 1
    
    # plot
    if(_clustered):
        result = sns.clustermap(\
            _correlation,\
            metric=_metric,\
            cmap=plt.cm.jet,\
            square=True,\
            figsize=_figsize,\
            annot=_overlay,\
            fmt='d')
        # early return: the final progress increment below is skipped
        return result, _overlay
    
        # Disabled experiment, kept as a note: when _annot is set, reorder the
        # overlay with result.dendrogram_col.reordered_ind
        # (_overlay.iloc[reorderedCols,reorderedCols]) and draw the clustermap
        # again with the reordered overlay before returning result, _overlay.
            
    else:
        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=_overlay, fmt='d')
    _progress.value += 1
    
#def plotAll():
    # loop on question types
    # loop on temporalities
    # loop on representations
    ## basic stats:
    ### mean score
    ### median score
    ### std
    ## percentage correct
    ### percentage correct - 3 columns
    ### percentage cross correct
    ### percentage cross correct, conditionnally
    ##correlation
    ### simple heatmap
#    plotCorrelationMatrix
    ### clustermap
#    plotCorrelationMatrix



In [ ]:

    
def plotSamples(samples):
    """Run plotBasicStats on every [sample, title] pair of `samples`.

    A progress bar is displayed and advanced by one step per sample. A message
    is printed if the final progress value differs from the number of samples.
    """
    _progress = FloatProgress(min=0, max=len(samples))
    display(_progress)

    for sample, title in samples:
        plotBasicStats(sample, title)
        _progress.value += 1

    if(_progress.value != len(samples)):
        # reads the local _progress: the former __progress name was undefined here
        print("__progress.value=" + str(_progress.value) + " != len(samples)=" + str(len(samples)))



In [ ]:

    
def getGFormDataPreview(_GFUserId, sample):
    """Summarize every gform response of a user, for per-gform, manual analysis.

    _GFUserId: player GUID, looked up in the global gform.
    sample: form DataFrame searched for responses with matching demographics.

    Returns a dict keyed 'survey0', 'survey1', ...; each value is a dict with
    the response's 'date', 'temporality RM', 'temporality GF', 'score',
    'genderAge' and 'demographic matches' (a list of [index in sample,
    player GUID] pairs).
    """
    _userSurveys = gform[gform[localplayerguidkey] == _GFUserId]
    _previews = {}

    for _position in range(len(_userSurveys)):
        _survey = _userSurveys.iloc[_position]

        _preview = {
            'date': _survey['Timestamp'],
            'temporality RM': _survey['Temporality'],
            'temporality GF': getGFormRowGFormTemporality(_survey),
            'score': getGFormRowScore(_survey),
            'genderAge': [_survey['What is your gender?'], _survey['How old are you?']],
        }

        # search for other users with similar demographics
        _matches = getMatchingDemographics(sample, _survey)
        _preview['demographic matches'] = [
            [_matchIndex, _matches.loc[_matchIndex, localplayerguidkey]]
            for _matchIndex in _matches.index
        ]

        _previews['survey' + str(_position)] = _preview

    return _previews

sample getters

set operators



In [ ]:

    
def getUnionQuestionnaires(sample1, sample2):
    """Return the rows of `sample1` and `sample2`, without duplicate rows.

    Indices do not need to be reset as they all come from gform.
    A warning is printed when the two samples do not have the same columns;
    the comparison uses Index.equals so that samples with a different number
    of columns produce the warning instead of raising.
    """
    if not sample1.columns.equals(sample2.columns):
        print("warning: parameter columns are not the same")
    return pd.concat([sample1, sample2]).drop_duplicates()



In [ ]:

    
def getIntersectionQuestionnaires(sample1, sample2):
    """Return the rows present in both `sample1` and `sample2`, deduplicated.

    The intersection is an inner pd.merge on the columns the samples share.
    NOTE(review): a column-on-column merge ignores the original indexes, so
    the result does not carry the gform index labels - confirm callers do not
    rely on them.
    A warning is printed when the two samples do not have the same columns;
    the comparison uses Index.equals so that samples with a different number
    of columns produce the warning instead of raising.
    """
    if not sample1.columns.equals(sample2.columns):
        print("warning: parameter columns are not the same")
    return pd.merge(sample1, sample2, how = 'inner').drop_duplicates()



In [ ]:

    
def getIntersectionUsersSurveys(sample1, sample2):
    """Return the sample1 and sample2 rows of the users common to both samples."""
    _idsIn1 = sample1[localplayerguidkey]
    _idsIn2 = sample2[localplayerguidkey]
    _sharedRows1 = sample1[_idsIn1.isin(_idsIn2)]
    _sharedRows2 = sample2[_idsIn2.isin(_idsIn1)]
    return getUnionQuestionnaires(_sharedRows1, _sharedRows2)

Users who answered either before or after



In [ ]:

    
# Titles of the four gform questions about previous play of Hero.Coli;
# the getGForm* helpers test the answers to these against previousPlayPositives.
QPlayed1 = 'Have you ever played an older version of Hero.Coli before?'
QPlayed2 = 'Have you played the current version of Hero.Coli?'
QPlayed3 = 'Have you played the arcade cabinet version of Hero.Coli?'
QPlayed4 = 'Have you played the Android version of Hero.Coli?'



In [ ]:

    
def getRMBefores(sample):
    """Return the rows of `sample` whose 'Temporality' is 'before'."""
    _isBefore = sample['Temporality'] == 'before'
    return sample[_isBefore]



In [ ]:

    
def getRMAfters(sample):
    """Return the rows of `sample` whose 'Temporality' is 'after'."""
    _isAfter = sample['Temporality'] == 'after'
    return sample[_isAfter]



In [ ]:

    
def getGFormBefores(sample):
    """Return the surveys declaring that the game was never played, whatever platform.

    A survey is kept when none of its answers to QPlayed1..QPlayed4 is in
    previousPlayPositives.
    previousPlayPositives is defined in '../Static data/English localization.ipynb'
    """
    _neverPlayed = ~sample[QPlayed1].isin(previousPlayPositives)
    for _question in (QPlayed2, QPlayed3, QPlayed4):
        _neverPlayed = _neverPlayed & ~sample[_question].isin(previousPlayPositives)
    return sample[_neverPlayed]



In [ ]:

    
def isGFormBefore(surveyAnswerIndex, _gform):
    """Tell whether the survey at label `surveyAnswerIndex` of `_gform` is kept by getGFormBefores."""
    _singleSurvey = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    _kept = getGFormBefores(_singleSurvey)
    return len(_kept) == 1



In [ ]:

    
def getGFormAfters(sample):
    """Return the surveys declaring that the game was already played, whatever platform.

    A survey is kept when at least one of its answers to QPlayed1..QPlayed4
    is in previousPlayPositives.
    previousPlayPositives is defined in '../Static data/English localization.ipynb'
    """
    _alreadyPlayed = sample[QPlayed1].isin(previousPlayPositives)
    for _question in (QPlayed2, QPlayed3, QPlayed4):
        _alreadyPlayed = _alreadyPlayed | sample[_question].isin(previousPlayPositives)
    return sample[_alreadyPlayed]



In [ ]:

    
def isGFormAfter(surveyAnswerIndex, _gform):
    """Tell whether the survey at label `surveyAnswerIndex` of `_gform` is kept by getGFormAfters."""
    _singleSurvey = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    _kept = getGFormAfters(_singleSurvey)
    return len(_kept) == 1



In [ ]:

    
def getGFormRowGFormTemporality(_gformRow):
    """Return the temporality declared by the survey itself.

    Returns answerTemporalities[1] when at least one answer to
    QPlayed1..QPlayed4 is in previousPlayPositives, answerTemporalities[0]
    otherwise.
    previousPlayPositives is defined in '../Static data/English localization.ipynb'
    """
    _playQuestions = (QPlayed1, QPlayed2, QPlayed3, QPlayed4)
    if any(_gformRow[_question] in previousPlayPositives for _question in _playQuestions):
        return answerTemporalities[1]
    return answerTemporalities[0]

Users who answered both before and after



In [ ]:

    
def getSurveysOfUsersWhoAnsweredBoth(sample, gfMode = True, rmMode = False):
    """Return the surveys of users who have both a 'before' and an 'after' survey.

    gfMode: filter befores/afters with getGFormBefores / getGFormAfters.
    rmMode: filter befores/afters with getRMBefores / getRMAfters.
    When both modes are set, the filters are applied one after the other;
    when none is set, befores and afters are both the whole sample.
    """
    _befores, _afters = sample, sample

    if gfMode:
        _befores, _afters = getGFormBefores(_befores), getGFormAfters(_afters)

    if rmMode:
        _befores, _afters = getRMBefores(_befores), getRMAfters(_afters)

    return getIntersectionUsersSurveys(_befores, _afters)



In [ ]:

    
def getSurveysThatAnswered(sample, questionsAndPositiveAnswers, hardPolicy = True):
    """Return the surveys of `sample` that gave the expected answers.

    sample: form DataFrame.
    questionsAndPositiveAnswers: iterable of [question, positiveAnswers]
        pairs; a survey matches a pair when its answer to `question` is in
        `positiveAnswers`.
    hardPolicy: when true, a survey must match every pair; otherwise matching
        at least one pair is enough.

    The filter is built on sample.index in both policies, so that it stays
    aligned with samples whose index is not 0..n-1 (e.g. filtered samples).
    """
    _strict = bool(hardPolicy)
    # neutral element: True for the conjunction, False for the disjunction
    filterSeries = pd.Series(_strict, index = sample.index)
    for question, positiveAnswers in questionsAndPositiveAnswers:
        _matches = sample[question].isin(positiveAnswers)
        if _strict:
            filterSeries = filterSeries & _matches
        else:
            filterSeries = filterSeries | _matches
    return sample[filterSeries]



In [ ]:

    
def getSurveysOfBiologists(sample, hardPolicy = True):
    """Return the surveys of people with a biology background.

    Surveys of people who have studied biology, and/or know about synthetic
    biology, and/or about BioBricks. With hardPolicy all three criteria are
    required, otherwise one is enough. The question 'Are you interested in
    biology?' is deliberately not used (considered irrelevant).
    """
    _criteria = [
        # studied biology
        ['How long have you studied biology?', biologyStudyPositives],
        # heard about synthetic biology
        ['Before playing Hero.Coli, had you ever heard about synthetic biology?', yesNoIdontknowPositives],
        # heard about BioBricks
        ['Before playing Hero.Coli, had you ever heard about BioBricks?', yesNoIdontknowPositives],
    ]
    return getSurveysThatAnswered(sample, _criteria, hardPolicy)



In [ ]:

    
def getSurveysOfGamers(sample, hardPolicy = True):
    """Return the surveys of people who play video games and/or are interested in them.

    With hardPolicy both criteria are required, otherwise one is enough.
    """
    _criteria = [
        ['Are you interested in video games?', interestPositives],
        ['Do you play video games?', frequencyPositives],
    ]
    return getSurveysThatAnswered(sample, _criteria, hardPolicy)



In [ ]:

    
def getSurveysWithMatchingAnswers(sample, _gformRow, strictList, extendedList = None, hardPolicy = False):
    """Return the rows of `sample` that gave the same answers as `_gformRow`.

    sample: DataFrame of survey answers.
    _gformRow: reference survey; indexed by question to read the answer to match.
    strictList: questions whose answers must always match.
    extendedList: additional questions that must match only when hardPolicy is
        True; None (the default) means no additional question.
    hardPolicy: whether the questions of extendedList are also compared.

    A row is kept only if it matches the reference on every compared question.
    """
    # Work on a copy: extending strictList itself would grow the caller's list
    # on every hard-policy call.
    questions = list(strictList)
    if hardPolicy and extendedList is not None:
        questions.extend(extendedList)

    # Each question accepts exactly one answer: the one of the reference row.
    questionsAndPositiveAnswers = [[q, [_gformRow[q]]] for q in questions]

    return getSurveysThatAnswered(sample, questionsAndPositiveAnswers, True)



In [ ]:

    
def getMatchingDemographics(sample, _gformRow, hardPolicy = False):
    """Return the rows of `sample` whose demographics match those of `_gformRow`.

    Age and gender are always compared. The remaining questions are compared
    only when hardPolicy is True.
    """
    # age and gender
    strictQuestions = [
        'How old are you?',
        'What is your gender?',
    ]

    extendedQuestions = [
        # interests, hobbies, and knowledge - evaluation may vary after playing
        # ('Are you interested in biology?' is not compared)
        'Are you interested in video games?',
        'Do you play video games?',
        'How long have you studied biology?',
        'Before playing Hero.Coli, had you ever heard about synthetic biology?',
        'Before playing Hero.Coli, had you ever heard about BioBricks?',
        # language may vary: players may have missed the opportunity to set it,
        # or may want to try and change it
        'Language',
    ]

    return getSurveysWithMatchingAnswers(
        sample,
        _gformRow,
        strictQuestions,
        extendedList = extendedQuestions,
        hardPolicy = hardPolicy
    )

Utility functions to sample



In [ ]:

    
def getDemographicSamples(rootSample):
    """Split `rootSample` into demographic sub-samples.

    Returns a list of [sample, label] pairs: the root sample itself, then the
    language, gender, biologist and gamer sub-samples.
    """
    language = rootSample['Language']
    gender = rootSample['What is your gender?']

    samples = [[rootSample, 'root sample']]

    # by language
    samples.append([rootSample[language == 'en'], 'English'])
    samples.append([rootSample[language == 'fr'], 'French'])

    # by gender
    samples.append([rootSample[gender == 'Female'], 'female'])
    samples.append([rootSample[gender == 'Male'], 'male'])

    # by background: strict = all criteria met, broad = at least one criterion met
    samples.append([getSurveysOfBiologists(rootSample), 'biologists - strict'])
    samples.append([getSurveysOfBiologists(rootSample, False), 'biologists - broad'])
    samples.append([getSurveysOfGamers(rootSample), 'gamers - strict'])
    samples.append([getSurveysOfGamers(rootSample, False), 'gamers - broad'])

    return samples



In [ ]:

    
def getTemporalitySamples(rootSample):
    """Split `rootSample` into temporality sub-samples.

    Returns a list of [sample, label] pairs: the root sample, the "before"
    samples, the "after" samples, and the samples of users who answered both,
    each according to RedMetrics (RM), to the Google form (GF), or to both.
    """
    samples = [[rootSample, 'root sample']]

    # befores
    samples.append([getRMBefores(rootSample), 'RedMetrics befores'])
    samples.append([getGFormBefores(rootSample), 'Google form befores'])
    samples.append([getRMBefores(getGFormBefores(rootSample)), 'GF & RedMetrics befores'])

    # afters
    samples.append([getRMAfters(rootSample), 'RedMetrics afters'])
    samples.append([getGFormAfters(rootSample), 'Google form afters'])
    samples.append([getRMAfters(getGFormAfters(rootSample)), 'GF & RedMetrics afters'])

    # users who answered both before and after
    bothModes = (
        (True, False, 'GF both before and after'),
        (False, True, 'RM both before and after'),
        (True, True, 'GF & RM both before and after'),
    )
    for useGF, useRM, label in bothModes:
        samples.append([getSurveysOfUsersWhoAnsweredBoth(rootSample, gfMode = useGF, rmMode = useRM), label])

    return samples

checkpoint validation



In [ ]:

    
# Returns, for a user id, the checkpoints validated in each corrections column.
def getValidatedCheckpoints( userId, _form = gform ):
    """Return the checkpoints validated by `userId`, one list per corrections column.

    For every column of getCorrections(userId) whose name contains
    correctionsColumnNameStem, the checkpoint of each entry whose correction
    equals True is read from checkpointQuestionMatching['checkpoint']; the
    result is deduplicated, sorted and re-indexed from 0.

    Returns a pd.Series holding one such Series per corrections column. If
    hasAnswered(userId) is false, a message is printed and the returned Series
    is empty.

    NOTE(review): presumably each corrections column stands for one submitted
    questionnaire - confirm against getCorrections.
    """
    _validatedCheckpoints = []
    
    if hasAnswered( userId, _form = _form ):
        _columnAnswers = getCorrections( userId, _form = _form)
        
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:        
                # one slot per row of checkpointQuestionMatching
                _questionnaireValidatedCheckpointsPerQuestion = pd.Series(np.nan, index=range(len(checkpointQuestionMatching)))

                for _index in range(0, len(_questionnaireValidatedCheckpointsPerQuestion)):
                    # a correct answer validates the checkpoint matched to that entry;
                    # anything else is marked '' and filtered out below
                    if _columnAnswers[_columnName][_index]==True:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = checkpointQuestionMatching['checkpoint'][_index]
                    else:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = ''

                # deduplicate, drop the '' marker, sort, then renumber from 0
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints!='']
                _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints)
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints.sort_values()
                _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))
                
                _validatedCheckpoints.append(_questionnaireValidatedCheckpoints) 
    else:
        print("user " + str(userId) + " has never answered")
    return pd.Series(_validatedCheckpoints)

def getValidatedCheckpointsCounts( _userId, _form = gform ):
    """Return a list with the number of validated checkpoints of each entry
    returned by getValidatedCheckpoints for `_userId`."""
    return [len(_checkpoints) for _checkpoints in getValidatedCheckpoints(_userId, _form = _form)]

def getNonValidated( checkpoints ):
    """Return the validable checkpoints missing from each entry of `checkpoints`.

    checkpoints: series of validated-checkpoint series, as built by
        getValidatedCheckpoints.

    Returns a pd.Series with, for each entry, the series of values of
    validableCheckpoints that are absent from it ('' excluded, index restarted
    at 0). When `checkpoints` is empty, validableCheckpoints itself is returned
    (a flat series, not a series of series).
    """
    # nothing was validated at all: every validable checkpoint is missing
    if 0 == len(checkpoints):
        return validableCheckpoints

    _missingPerEntry = []
    for _validated in checkpoints:
        _missing = pd.Series(np.setdiff1d(validableCheckpoints.values, _validated.values))
        _missing = _missing[_missing != '']
        _missing.index = range(0, len(_missing))
        _missingPerEntry.append(_missing)
    return pd.Series(_missingPerEntry)

def getNonValidatedCheckpoints( userId, _form = gform ):
    """Return what getNonValidated yields for the checkpoints validated by `userId`."""
    return getNonValidated(getValidatedCheckpoints( userId, _form = _form ))

def getNonValidatedCheckpointsCounts( userId, _form = gform ):
    """Return a list with the number of non-validated checkpoints per entry.

    One count is returned for each entry of getValidatedCheckpoints(userId).
    If there is no entry at all, every validable checkpoint is non-validated
    and a single count, len(validableCheckpoints), is returned.
    """
    _validated = getValidatedCheckpoints(userId, _form = _form)

    # In this case getNonValidated returns the flat validableCheckpoints rather
    # than a series of lists; iterating it would measure the length of each
    # checkpoint value instead of counting checkpoints.
    if 0 == len(_validated):
        return [len(validableCheckpoints)]

    return [len(_missing) for _missing in getNonValidated(_validated)]

p(answered question N | answered question P)



In [ ]:

    
# Returns every row of the Google form answers whose answer to the question
#   at column position 'questionIndex' is one of the values of the array 'choice'.
def getAllAnswerRows(questionIndex, choice, _form = gform ):
    _questionAnswers = _form.iloc[:, questionIndex]
    _isChosen = _questionAnswers.isin(choice)
    return _form[_isChosen]

def getPercentCorrectPerColumn(_df):
    """Return, per column of `_df`, the fraction of rows answered correctly.

    Only the columns located from firstEvaluationQuestionIndex up to, but not
    including, the last three columns are evaluated; the other columns stay
    NaN. An answer is correct when its string form starts with the string form
    of correctAnswers at the column's position. The result is a Series indexed
    by the column names of `_df`, plus a 'Count' entry holding the number of rows.
    """
    _rowCount = len(_df)
    _correct = pd.Series(np.nan, index=_df.columns, dtype=float)

    for _rowIndex in _df.index:
        for _columnName in _df.columns:
            _position = _df.columns.get_loc(_columnName)

            # skip every column outside of the evaluated range
            if (_position < firstEvaluationQuestionIndex) \
                or (_position >= len(_df.columns)-3):
                continue

            _isCorrect = str(_df[_columnName][_rowIndex]).startswith(str(correctAnswers[_position]))

            if np.isnan(_correct[_columnName]):
                # first answer seen for this column: start the tally at 1 or 0
                _correct[_columnName] = 1 if _isCorrect else 0
            elif _isCorrect:
                _correct[_columnName] = _correct[_columnName]+1

    _correct = _correct/_rowCount
    _correct['Count'] = _rowCount
    return _correct

def getPercentCorrectKnowingAnswer(questionIndex, choice, _form = gform):
    """Return getPercentCorrectPerColumn computed on the rows of `_form` whose
    answer at column position `questionIndex` is one of the values of `choice`."""
    return getPercentCorrectPerColumn(getAllAnswerRows(questionIndex, choice, _form = _form))

Filtering users



In [ ]:

    
def getTestAnswers( _form = gform, _rmDF = rmdf152, _rmTestDF = normalizedRMDFTest, includeAndroid = True):
    """Return the rows of `_form` whose user id is one of the values of testUsers.

    _rmDF, _rmTestDF and includeAndroid are accepted but not used.
    """
    _testUserIds = testUsers.values.flatten()
    _isTestUser = _form[localplayerguidkey].isin(_testUserIds)
    return _form[_isTestUser]

Initialization of gform



In [ ]:

    
# Initialization of gform, executed every time this notebook is run.
# NOTE(review): presumably tags each gform answer with its temporality;
# setAnswerTemporalities is defined outside of this section - confirm.
setAnswerTemporalities()

Google form analysis

Table of Contents

Preparation

Constants

Functions

general purpose

sessions and temporalities

score

visualizations

sample getters

set operators

Users who answered either before or after

Users who answered both before and after

Utility functions to sample

checkpoint validation

p(answered question N | answered question P)

Filtering users

Initialization of gform