Google form analysis

Analysis of results extracted from Google forms in csv format.

Preparation


In [ ]:
%run "../Functions/2. Game sessions.ipynb"

Constants


In [ ]:
# special user ids
# NOTE(review): judging by their names only, these GUIDs look like hand-picked reference
# users (no answer / one answer / several answers / one score / several scores, per
# language) - confirm against the data before relying on that reading.
userIDThatDidNotAnswer = '001c95c6-8207-43dc-a51b-adf0c6e005d7'

# English-language reference users
userID1AnswerEN = '00dbbdca-d86c-4bc9-803c-0602e0153f68'
userIDAnswersEN = '5977184a-1be2-4725-9b48-f2782dc03efb'
userID1ScoreEN = '6b5d392d-b737-49ef-99af-e8c445ff6379'
userIDScoresEN = '5ecf601d-4eac-433e-8056-3a5b9eda0555'

# French-language reference users
userID1AnswerFR = '2734a37d-4ba5-454f-bf85-1f7b767138f6'
userIDAnswersFR = '01e85778-2903-447b-bbab-dd750564ee2d'
userID1ScoreFR = '3d733347-0313-441a-b77c-3e4046042a53'
userIDScoresFR = '58d22690-8604-41cf-a5b7-d71fb3b9ad5b'

# presumably a user with answers in both languages - TODO confirm
userIDAnswersENFR = 'a7936587-8b71-43b6-9c61-17b2c2b55de3'

In [ ]:
# Header of the gform column that holds each respondent's pre-filled anonymous ID.
# The header used by the French form is kept below for reference:
#localplayerguidkey = 'Ne pas modifier - identifiant anonyme prérempli'
localplayerguidkey = 'Do not edit -  pre-filled anonymous ID'
# Positional index of that column in gform.columns; the bare name displays it as cell output.
localplayerguidindex = gform.columns.get_loc(localplayerguidkey)
localplayerguidindex

In [ ]:
# Header of the gform column named as the "first evaluation question".
firstEvaluationQuestionKey = 'In order to modify the abilities of the bacterium, you have to...'
# Positional index of that column in gform.columns; the bare name displays it as cell output.
firstEvaluationQuestionIndex = gform.columns.get_loc(firstEvaluationQuestionKey)
firstEvaluationQuestionIndex

In [ ]:
# Prefixes of the per-survey column names: getAnswers names its columns
# answersColumnNameStem + <gform row label>, and getCorrections adds the matching
# correctionsColumnNameStem + <gform row label> columns.
answersColumnNameStem = "answers"
correctionsColumnNameStem = "corrections"

Functions

general purpose


In [ ]:
def getUniqueUserCount(sample):
    """Return the number of distinct values in the player-ID column of `sample`."""
    playerIds = sample[localplayerguidkey]
    return playerIds.nunique()

In [ ]:
def getAllResponders( _form = gform ):
    """Return the distinct values of the player-ID column of `_form`, as produced by `unique()`."""
    return _form[localplayerguidkey].unique()

def getRandomGFormGUID():
    """Pick responders of the default form at random until one passes isGUIDFormat, and return it.

    The format check is delegated to isGUIDFormat, which is defined outside this section.
    The loop only ends once a responder ID passes that check.
    """
    responders = getAllResponders()
    lastPosition = len(responders) - 1
    candidate = '0'
    while not isGUIDFormat(candidate):
        candidate = responders[randint(0, lastPosition)]
    return candidate

def hasAnswered( userId, _form = gform ):
    """Tell whether `userId` appears among the values of the player-ID column of `_form`."""
    knownIds = _form[localplayerguidkey].values
    return userId in knownIds

def getAnswers( userId, _form = gform ):
    """Return the surveys of `userId` transposed: one row per gform column, one column per survey.

    Each survey column is named answersColumnNameStem followed by the label of the
    corresponding row of `_form`. When the user has no survey, a message is printed and
    the (column-less) transposed frame is returned as is.
    """
    userRows = _form[_form[localplayerguidkey]==userId]
    questionsAsRows = userRows.T

    if len(userRows) == 0:
        # user has never answered
        print("user " + str(userId) + " has never answered")
        return questionsAsRows

    questionsAsRows.columns = [
        answersColumnNameStem + str(rowLabel) for rowLabel in questionsAsRows.columns
    ]
    return questionsAsRows

sessions and temporalities


In [ ]:
def setAnswerTemporalities( _gformDF = gform ):
    """Fill the 'Temporality' column of `_gformDF` in place.

    Nothing is done unless the column currently holds a single distinct value, which is
    taken to mean "not set yet". Each row is classified with getTemporality against the
    date returned by getFirstEventDate for its user (looked up once per user). Per user,
    at most one row keeps answerTemporalities[0] (the one with the latest Timestamp) and
    at most one keeps answerTemporalities[1] (the one with the earliest Timestamp); the
    other rows of that kind are relabelled answerTemporalities[2].

    Prints "temporalities set" when the pass has run; returns None.
    """
    # check whether temporalities have already been set
    if(len(_gformDF['Temporality'].unique()) == 1):
        # format : key = _userId, value = [_firstEventDate, 0 or _gformDF.index of before, 0 or _gformDF.index of after]
        temporalities = {}

        for _index in _gformDF.index:
            _userId = _gformDF.loc[_index,localplayerguidkey]
            _firstEventDate, beforeIndex, afterIndex = [0,0,0]

            # reuse what is already known about this user, otherwise fetch the first event date
            if _userId in temporalities:
                _firstEventDate, beforeIndex, afterIndex = temporalities[_userId]
            else:
                _firstEventDate = getFirstEventDate(_userId)

            temporality = getTemporality(_gformDF.loc[_index,'Timestamp'],_firstEventDate)

            # NOTE(review): 0 is used as the "no row recorded yet" sentinel; if 0 is also a
            # valid label of _gformDF.index, a row recorded at label 0 is treated as absent - confirm.
            if temporality == answerTemporalities[0] and beforeIndex != 0 :
                # a second row of the first kind: keep the later one, demote the other
                if _gformDF.loc[_index,'Timestamp'] > _gformDF.loc[beforeIndex,'Timestamp']:
                    _gformDF.loc[beforeIndex,'Temporality'] = answerTemporalities[2]
                else:
                    temporality = answerTemporalities[2]
            elif temporality == answerTemporalities[1] and afterIndex != 0 :
                # a second row of the second kind: keep the earlier one, demote the other
                if _gformDF.loc[_index,'Timestamp'] < _gformDF.loc[afterIndex,'Timestamp']:
                    _gformDF.loc[afterIndex,'Temporality'] = answerTemporalities[2]
                else:
                    temporality = answerTemporalities[2]

            _gformDF.loc[_index,'Temporality'] = temporality
            # remember which row currently holds each kept label for this user
            if temporality == answerTemporalities[0]:
                beforeIndex = _index
            elif temporality == answerTemporalities[1]:
                afterIndex = _index

            temporalities[_userId] = [_firstEventDate, beforeIndex, afterIndex]
        print("temporalities set")

# when did the user answer the questionnaire? 
# After gameEventDate, before gameEventDate, undefined?
# answerDate is assumed to be the gform Timestamp, UTC
# gameEventDate is assumed to be of type pandas._libs.tslib.Timestamp, UTC, from RedMetrics
def getTemporality( answerDate, gameEventDate ):
    """Classify `answerDate` relative to `gameEventDate`.

    Returns answerTemporalities[0] when answerDate <= gameEventDate,
    answerTemporalities[1] when answerDate > gameEventDate, and answerTemporalities[2]
    when gameEventDate equals the UTC-localized pd.Timestamp.max or when neither
    comparison holds.
    """
    fallbackLabel = answerTemporalities[2]
    noEventSentinel = pd.Timestamp.max.tz_localize('utc')
    if gameEventDate != noEventSentinel:
        if answerDate <= gameEventDate:
            return answerTemporalities[0]
        if answerDate > gameEventDate:
            return answerTemporalities[1]
    return fallbackLabel

score


In [ ]:
def getCorrections( _userId, _source = correctAnswers, _form = gform, _columnAnswers = [] ):
    """Add one corrections column per answers column to a user's transposed surveys.

    _userId: ID used to fetch the answers with getAnswers when `_columnAnswers` is empty.
    _source: indexed by question; `_source.loc[question]` yields the accepted answers
        (an empty value means the question is not graded).
    _form: form passed on to getAnswers.
    _columnAnswers: optional pre-fetched frame shaped like getAnswers' output; it is
        modified in place when provided.

    For every column whose name contains answersColumnNameStem, a column named
    correctionsColumnNameStem + <same suffix> is added. A cell is True when the answer,
    as a string, starts with one of the accepted answers, False when it matches none,
    and stays NaN when the question has no accepted answer.

    Returns the frame; when it has no column, a message is printed instead of grading.
    """
    if(len(_columnAnswers) == 0):
        _columnAnswers = getAnswers( _userId, _form = _form )

    if 0 != len(_columnAnswers.columns):

        _questionsCount = len(_columnAnswers.values)

        # the Index being iterated is the one that existed before any column was added
        for _columnName in _columnAnswers.columns:
            if answersColumnNameStem in _columnName:
                _answerNumber = _columnName.replace(answersColumnNameStem,"")
                newCorrectionsColumnName = correctionsColumnNameStem + _answerNumber

                # start from an all-NaN column; graded questions are overwritten below
                #_columnAnswers[newCorrectionsColumnName] = _columnAnswers[_columnName]
                _columnAnswers[newCorrectionsColumnName] = pd.Series(np.full(_questionsCount, np.nan))

                for question in _columnAnswers[_columnName].index:
                    _correctAnswers = _source.loc[question]
                    
                    if(len(_correctAnswers) > 0):
                        _columnAnswers.loc[question,newCorrectionsColumnName] = False
                        for _correctAnswer in _correctAnswers:
                            # prefix match on the string forms of both values
                            if str(_columnAnswers.loc[question,_columnName])\
                            .startswith(str(_correctAnswer)):
                                _columnAnswers.loc[question,newCorrectionsColumnName] = True
                                break
                        

    else:
        # user has never answered
        print("can't give correct answers")
    return _columnAnswers
    

# edits in-place
# _corrections must be a dataframe full of corrections as produced above
def getBinarizedCorrections( _corrections ):
    """Replace, in place, every cell equal to True by 1.0 and every cell equal to False by 0.0.

    Cells equal to neither (NaN, free text, ...) are left untouched.
    Returns the same frame object that was passed in.
    """
    for column in _corrections.columns:
        for rowLabel in _corrections[column].index:
            cell = _corrections.loc[rowLabel, column]
            if True == cell:
                _corrections.loc[rowLabel, column] = 1.0
            elif False == cell:
                _corrections.loc[rowLabel, column] = 0.0
    return _corrections

# only for one line in the gform
def getBinarized(_gformLine, _source = correctAnswers):
    """Grade a single gform row and return 0/1 marks for the graded questions only.

    A question is graded when `_source.loc[question]` is non-empty. Its mark is 1 when
    the row's answer, as a string, starts with one of the accepted answers, 0 otherwise.
    The result is restricted to the graded entries of `_source.index`.
    """
    gradedQuestions = [label for label in _source.index if len(_source.loc[label]) > 0]

    marks = pd.Series(np.full(len(_gformLine.index), np.nan), index = _gformLine.index)

    for question in _gformLine.index:
        accepted = _source.loc[question]
        if len(accepted) == 0:
            # ungraded question: stays NaN
            continue

        marks[question] = 0
        if any(str(_gformLine.loc[question]).startswith(str(candidate)) for candidate in accepted):
            marks.loc[question] = 1

    return marks.loc[gradedQuestions]

def getAllBinarized(_source = correctAnswers, _form = gform ):
    """Build the table of binarized corrections of every responder of `_form`.

    For each responder, the corrections columns produced by getCorrections are binarized
    with getBinarizedCorrections and restricted to the graded questions (non-empty
    entries of `_source`). The result has one row per corrections column and one column
    per graded question.
    """
    gradedQuestions = [label for label in _source.index if len(_source.loc[label]) > 0]

    collected = pd.DataFrame(index = gradedQuestions)
    for responder in getAllResponders( _form = _form ):
        graded = getBinarizedCorrections(
            getCorrections(responder, _source=_source, _form = _form)
        )
        # keep the graded rows, then only the columns holding corrections
        gradedRows = graded.loc[gradedQuestions]
        isCorrectionColumn = graded.columns.to_series().str.contains(correctionsColumnNameStem)
        userSlice = gradedRows[graded.columns[isCorrectionColumn]]

        collected = pd.concat([collected, userSlice], axis=1)

    return collected.T
    

# CCA.iloc[i,j] is the number of users who correctly answered questions number i and j
# CCA[i,j] = Sum(A[u,i] * A[u,j], u in users) = Sum(tA[i,u] * A[u,j], u in users) = tA.A[i,j]
# CCA[i,j] is an int
def getCrossCorrectAnswers( _binarizedAnswers ):
    """Return the matrix product of the transposed input with the input itself.

    With rows as surveys and columns as questions holding 0/1 marks, entry [i, j] is the
    number of surveys that are correct on both question i and question j.
    """
    questionsBySurveys = _binarizedAnswers.T
    return questionsBySurveys.dot(_binarizedAnswers)

#function that returns the score from user id
# label of the single row of the frame returned by getScore
scoreLabel = 'score'
def getScore( _userId, _form = gform, _source = correctAnswers ):
    """Return the scores of `_userId`, grouped by temporality.

    The result is a one-row DataFrame (row label scoreLabel) with one column per entry
    of answerTemporalities; each cell is a list holding one score per survey of that
    temporality. A score is the number of True values in the survey's corrections column.
    When the user has no survey in `_form`, a message is printed and the lists stay empty.
    """
    _score = pd.DataFrame({}, columns = answerTemporalities)
    # create the row first, then put a distinct empty list in each of its cells
    _score.loc[scoreLabel,:] = np.nan
    for _column in _score.columns:
        _score.loc[scoreLabel, _column] = []

    if hasAnswered( _userId, _form = _form ):
        _columnAnswers = getCorrections(_userId, _form = _form, _source = _source)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # the temporality is read from the 'Temporality' row of the matching answers column
                _answerColumnName = _columnName.replace(correctionsColumnNameStem,\
                                                      answersColumnNameStem)
                _temporality = _columnAnswers.loc['Temporality',_answerColumnName]

                _counts = (_columnAnswers[_columnName]).value_counts()
                _thisScore = 0
                if(True in _counts):
                    _thisScore = _counts[True]
                # appends to the list stored in the cell
                _score.loc[scoreLabel,_temporality].append(_thisScore)
    else:
        print("user " + str(_userId) + " has never answered")

    return _score


def getGFormRowCorrection( _gformRow, _source = correctAnswers):
    """Grade one gform row against `_source`.

    Returns a Series indexed like `_gformRow`: True when the answer, as a string, starts
    with one of the accepted answers of `_source.loc[question]`, False when it matches
    none of them, NaN when the question has no accepted answer. For an empty row, a
    message is printed and a copy of the row is returned.
    """
    if len(_gformRow) == 0:
        print("this gform row is empty")
        return _gformRow.copy()

    graded = pd.Series(index = _gformRow.index, data = np.full(len(_gformRow), np.nan))

    for question in graded.index:
        accepted = _source.loc[question]
        if len(accepted) > 0:
            graded.loc[question] = any(
                str(_gformRow.loc[question]).startswith(str(candidate))
                for candidate in accepted
            )
    return graded

def getGFormRowScore( _gformRow, _source = correctAnswers):
    """Return how many entries of the row's correction (getGFormRowCorrection) are True."""
    tally = getGFormRowCorrection( _gformRow, _source = _source).value_counts()
    return tally[True] if True in tally else 0

visualizations


In [ ]:
def createStatSet(series, ids = pd.Series()):
    """Summarize `series` as a dict with keys count, unique, median, mean and std.

    count and unique are computed on `ids` (falling back to the index of `series` when
    `ids` is empty); median, mean and std are computed on `series` itself.
    """
    identifiers = series.index if 0 == len(ids) else ids
    return dict(
        count = len(identifiers),
        unique = len(identifiers.unique()),
        median = series.median(),
        mean = series.mean(),
        std = series.std(),
    )

# _binarized must be well-formed, similarly to getAllBinarized's output
def getPercentagePerQuestion(_binarized):
    """Return, per column of `_binarized`, the column total as a percentage of the row count.

    The result is a single-column DataFrame indexed by the columns of `_binarized`.
    """
    rowCount = _binarized.shape[0]
    columnTotals = np.dot(np.ones(rowCount), _binarized)
    totalsFrame = pd.DataFrame(data=columnTotals, index=_binarized.columns)
    return totalsFrame*100 / rowCount

## sample can be: all, those who answered both before and after,
## those who played between date1 and date2, ...
from scipy.stats import ttest_ind
def plotBasicStats(
    sample,
    title = '',
    includeAll = False,
    includeBefore = True,
    includeAfter = True,
    includeUndefined = False,
    includeProgress = True,
    includeRelativeProgress = False,
):
    """Print score statistics for `sample` and plot a heatmap of the % of correct answers per question.

    sample: surveys to analyse; its 'Temporality' column is compared to 'before',
        'after' and 'undefined' to split it, and its player-ID column is used for counts.
    title: printed above the statistics table and appended to the heatmap title.
    include*: select which categories ('all', 'before', 'after', 'undefined',
        'progress', 'rel. progress') are computed and shown. 'progress' is the
        difference after - before; 'rel. progress' divides that difference by the
        'before' percentage (0 when the 'before' percentage is 0).

    A t-test between the before and after scores is printed when both hold more than
    two values. Tables containing NaN are left out of the heatmap.

    Returns the binarized frames: (sci all, sci before, sci after, sci undefined,
    all-questions all, all-questions before, all-questions after, all-questions
    undefined); categories that were not computed are empty DataFrames.
    """
    
    stepsPerInclude = 2
    # NOTE(review): includeRelativeProgress is not part of this count, and the
    # before/after computations below also run when only includeProgress is set, so the
    # expected step total can differ from the steps actually taken (reported at the end).
    includeCount = np.sum([includeAll, includeBefore, includeAfter, includeUndefined, includeProgress])
    stepsCount = stepsPerInclude*includeCount + 3
    
    #print("stepsPerInclude=" + str(stepsPerInclude))
    #print("includeCount=" + str(includeCount))
    #print("stepsCount=" + str(stepsCount))
    
    __progress = FloatProgress(min=0, max=stepsCount)
    display(__progress)
    
    # split the sample by temporality label
    sampleBefore = sample[sample['Temporality'] == 'before']
    sampleAfter = sample[sample['Temporality'] == 'after']
    sampleUndefined = sample[sample['Temporality'] == 'undefined']

    #uniqueBefore = sampleBefore[localplayerguidkey]
    #uniqueAfter = 
    #uniqueUndefined =

    # two grading sources: the scientific questions alone, and those combined with demographicAnswers
    scientificQuestions = correctAnswers.copy()
    allQuestions = correctAnswers + demographicAnswers
    
    categories = ['all', 'before', 'after', 'undefined', 'progress', 'rel. progress']
    data = {}
    
    # placeholders, so that every returned value exists whatever the include flags
    sciBinarized = pd.DataFrame()
    allBinarized = pd.DataFrame()
    scoresAll = pd.DataFrame()
    
    sciBinarizedBefore = pd.DataFrame()
    allBinarizedBefore = pd.DataFrame()
    scoresBefore = pd.DataFrame()
    
    sciBinarizedAfter = pd.DataFrame()
    allBinarizedAfter = pd.DataFrame()
    scoresAfter = pd.DataFrame()
    
    sciBinarizedUndefined = pd.DataFrame()
    allBinarizedUndefined = pd.DataFrame()
    scoresUndefined = pd.DataFrame()

    scoresProgress = pd.DataFrame()

    ## basic stats:
    ### mean score
    ### median score
    ### std
    if includeAll:
        sciBinarized = getAllBinarized( _source = scientificQuestions, _form = sample)
        __progress.value += 1
        allBinarized = getAllBinarized( _source = allQuestions, _form = sample)
        __progress.value += 1
        # score of a survey = sum of its row of binarized scientific answers
        scoresAll = pd.Series(np.dot(sciBinarized, np.ones(sciBinarized.shape[1])))
        
        data[categories[0]] = createStatSet(scoresAll, sample[localplayerguidkey])
        
    # 'before' data is also needed to compute the progress
    if includeBefore or includeProgress:
        sciBinarizedBefore = getAllBinarized( _source = scientificQuestions, _form = sampleBefore)
        __progress.value += 1
        allBinarizedBefore = getAllBinarized( _source = allQuestions, _form = sampleBefore)
        __progress.value += 1
        scoresBefore = pd.Series(np.dot(sciBinarizedBefore, np.ones(sciBinarizedBefore.shape[1])))
        temporaryStatSetBefore = createStatSet(scoresBefore, sampleBefore[localplayerguidkey])
    if includeBefore:
        data[categories[1]] = temporaryStatSetBefore
        
    # 'after' data is also needed to compute the progress
    if includeAfter or includeProgress:
        sciBinarizedAfter = getAllBinarized( _source = scientificQuestions, _form = sampleAfter)
        __progress.value += 1
        allBinarizedAfter = getAllBinarized( _source = allQuestions, _form = sampleAfter)
        __progress.value += 1
        scoresAfter = pd.Series(np.dot(sciBinarizedAfter, np.ones(sciBinarizedAfter.shape[1])))
        temporaryStatSetAfter = createStatSet(scoresAfter, sampleAfter[localplayerguidkey])
    if includeAfter:
        data[categories[2]] = temporaryStatSetAfter
        
    if includeUndefined:
        sciBinarizedUndefined = getAllBinarized( _source = scientificQuestions, _form = sampleUndefined)
        __progress.value += 1
        allBinarizedUndefined = getAllBinarized( _source = allQuestions, _form = sampleUndefined)
        __progress.value += 1
        scoresUndefined = pd.Series(np.dot(sciBinarizedUndefined, np.ones(sciBinarizedUndefined.shape[1])))
        
        data[categories[3]] = createStatSet(scoresUndefined, sampleUndefined[localplayerguidkey])

    if includeProgress:
        # progress = difference of the 'after' and 'before' statistics;
        # the counts are the smaller of the two groups
        data[categories[4]] = {
            'count' : min(temporaryStatSetAfter['count'], temporaryStatSetBefore['count']),
            'unique' : min(temporaryStatSetAfter['unique'], temporaryStatSetBefore['unique']),
            'median' : temporaryStatSetAfter['median']-temporaryStatSetBefore['median'],
            'mean' : temporaryStatSetAfter['mean']-temporaryStatSetBefore['mean'],
            'std' : temporaryStatSetAfter['std']-temporaryStatSetBefore['std'],
        }
        __progress.value += 2
    
    
    result = pd.DataFrame(data)
    __progress.value += 1

    print(title)
    print(result)
    if (includeBefore and includeAfter) or includeProgress:
        # only test when both groups hold more than two scores
        if (len(scoresBefore) > 2 and len(scoresAfter) > 2):
            ttest = ttest_ind(scoresBefore, scoresAfter)
            print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
    print()

    ## percentage correct
    ### percentage correct - max 5 columns
    percentagePerQuestionAll = pd.DataFrame()
    percentagePerQuestionBefore = pd.DataFrame()
    percentagePerQuestionAfter = pd.DataFrame()
    percentagePerQuestionUndefined = pd.DataFrame()
    percentagePerQuestionProgress = pd.DataFrame()
    
    # [table, category name] pairs that are candidates for the heatmap
    tables = []

    if includeAll:
        percentagePerQuestionAll = getPercentagePerQuestion(allBinarized)
        tables.append([percentagePerQuestionAll, categories[0]])
        
    if includeBefore or includeProgress:
        percentagePerQuestionBefore = getPercentagePerQuestion(allBinarizedBefore)
    if includeBefore:
        tables.append([percentagePerQuestionBefore, categories[1]])
        
    if includeAfter or includeProgress:
        percentagePerQuestionAfter = getPercentagePerQuestion(allBinarizedAfter)
    if includeAfter:
        tables.append([percentagePerQuestionAfter, categories[2]])
        
    if includeUndefined:
        percentagePerQuestionUndefined = getPercentagePerQuestion(allBinarizedUndefined)
        tables.append([percentagePerQuestionUndefined, categories[3]])
        
    if includeProgress or includeRelativeProgress:
        percentagePerQuestionProgress = percentagePerQuestionAfter - percentagePerQuestionBefore
        
        if includeProgress:
            tables.append([percentagePerQuestionProgress, categories[4]])
            
        if includeRelativeProgress:
            # use temporaryStatSetAfter['count'], temporaryStatSetBefore['count']?
            percentagePerQuestionProgress2 = percentagePerQuestionProgress.copy()
            for index in range(0,len(percentagePerQuestionProgress.index)):
                # avoid dividing by a 'before' percentage of 0
                if (0 == percentagePerQuestionBefore.iloc[index,0]):
                    percentagePerQuestionProgress2.iloc[index,0] = 0
                else:
                    percentagePerQuestionProgress2.iloc[index,0] = \
                    percentagePerQuestionProgress.iloc[index,0]/percentagePerQuestionBefore.iloc[index,0]
            tables.append([percentagePerQuestionProgress2, categories[5]])
    
    __progress.value += 1

    graphTitle = '% correct: '
    toConcat = []
    
    # keep only the non-empty tables whose first column has no NaN
    for table,category in tables:
        concat = (len(table.values) > 0)
        for elt in table.iloc[:,0].values:
            if np.isnan(elt):
                concat = False
                break
        if(concat):
            graphTitle = graphTitle + category + ' '
            toConcat.append(table)

    if (len(toConcat) > 0):
        percentagePerQuestionConcatenated = pd.concat(
            toConcat
            , axis=1)

        if(len(title) > 0):
            graphTitle = graphTitle + ' - ' + title

        _fig = plt.figure(figsize=(20,20))
        _ax1 = plt.subplot(111)
        _ax1.set_title(graphTitle)
        # values are rounded to integers so they can be annotated with fmt='d'
        sns.heatmap(percentagePerQuestionConcatenated.round().astype(int),ax=_ax1,cmap=plt.cm.jet,square=True,annot=True,fmt='d')
    __progress.value += 1
    
    ### percentage cross correct
    ### percentage cross correct, conditionally
    
    # report any discrepancy between expected and actual progress steps
    if(__progress.value != stepsCount):
        print("__progress.value=" + str(__progress.value) + " != stepsCount=" + str(stepsCount))

    return sciBinarized, sciBinarizedBefore, sciBinarizedAfter, sciBinarizedUndefined, \
            allBinarized, allBinarizedBefore, allBinarizedAfter, allBinarizedUndefined

    
    
def plotCorrelationMatrices(
    allBinarized = [],
    beforeBinarized = [],
    afterBinarized = [],
    undefinedBinarized = [],
    titleAll = 'Correlation of pre- & post-test answers',
    titleBefore = 'Correlation of pre-test answers',
    titleAfter = 'Correlation of post-test answers',
    titleUndefined = 'Correlation of undefined answers',
    titleSuffix = '',
):
    """Plot one annotated, absolute-value correlation heatmap per non-empty input.

    Each of the four inputs is passed to plotCorrelationMatrix (not clustered, with
    question numbers, 20x20 figure) under its own title followed by `titleSuffix`.
    Inputs of length 0 are skipped.
    """
    matricesAndTitles = [
        (allBinarized, titleAll + titleSuffix),
        (beforeBinarized, titleBefore + titleSuffix),
        (afterBinarized, titleAfter + titleSuffix),
        (undefinedBinarized, titleUndefined + titleSuffix),
    ]

    for matrix, fullTitle in matricesAndTitles:
        if len(matrix) == 0:
            continue
        plotCorrelationMatrix(
            matrix,
            _abs=True,
            _clustered=False,
            _questionNumbers=True,
            _annot = True,
            _figsize = (20,20),
            _title=fullTitle,
        )
    
##correlation
### simple heatmap
### clustermap
# correlation methods accepted by plotCorrelationMatrix; the first one is the fallback
methods = ['pearson', 'kendall', 'spearman']
def plotCorrelationMatrix( 
    _binarizedMatrix, 
    _method = methods[0], 
    _title='Questions\' Correlations', 
    _abs=False, 
    _clustered=False, 
    _questionNumbers=False,
    _annot = False,
    _figsize = (10,10),
    _metric='euclidean'
):
    """Plot the correlation matrix of the columns of `_binarizedMatrix`.

    _method: one of `methods`; any other value falls back to methods[0].
    _title: title of the heatmap (only used when not clustered).
    _abs: plot absolute correlation values.
    _clustered: draw a seaborn clustermap instead of a heatmap; rows that are entirely
        NaN in the correlation matrix are dropped first.
    _questionNumbers: append ' #<position>' to the axis labels.
    _annot: annotate cells with the counts given by getCrossCorrectAnswers.
    _figsize: figure size.
    _metric: distance metric passed to the clustermap.

    Returns (clustermap result, overlay) when `_clustered` is set, None otherwise.
    """
    
    _progress = FloatProgress(min=0, max=7)
    display(_progress)
    
    # passed as `annot`; stays False unless _annot is requested
    _overlay = False

    _progress.value += 1
    
    # computation of correlation matrix
    _m = _method
    if(not (_method in methods)):
        _m = methods[0]
    _correlation = _binarizedMatrix.astype(float).corr(_m)
    _progress.value += 1
    if(_abs):
        _correlation = _correlation.abs()
    _progress.value += 1
    
    if(_clustered):
    # removing NaNs
    # can't cluster NaN lines in _correlation
        _notNaNsIndices = []
        _notNaNsColumns = []
        for index in _correlation.index:
            # NOTE(review): `~` is applied to the result of .all(); this keeps rows that are
            # not entirely NaN only if .all() returns a numpy boolean (with a plain Python
            # bool, ~ yields a truthy int) - confirm with the pandas version in use.
            if(~np.isnan(_correlation.loc[index,:]).all()):
                _notNaNsIndices.append(index)
        #for column in _correlation.columns:
        #    if(~np.isnan(_correlation.loc[:,column]).all()):
        #        _notNaNsColumns.append(column)
        
        _binarizedMatrix = _binarizedMatrix.loc[:,_notNaNsIndices]
        _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
    _progress.value += 1
        
        
    # optional computation of overlay
    if(_annot):
        _overlay = getCrossCorrectAnswers(_binarizedMatrix).astype(int)
    _progress.value += 1
    
    # preparation of plot labels
    if(_questionNumbers):
        _correlation.columns = pd.Series(_correlation.columns).apply(\
                lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
        if(_clustered):
            # built from the columns that were already renamed just above
            _correlation.index = pd.Series(_correlation.columns).apply(\
                lambda x: '#' + str(_correlation.columns.get_loc(x) + 1) + ' ' + x)
        else:
            _correlation.index = _correlation.columns
    _progress.value += 1
    
    # plot
    if(_clustered):
        result = sns.clustermap(\
            _correlation,\
            metric=_metric,\
            cmap=plt.cm.jet,\
            square=True,\
            figsize=_figsize,\
            annot=_overlay,\
            fmt='d')
        # returns before the last progress step below
        return result, _overlay
    
#        if(_annot):
            # reorder columns using clustergrid.dendrogram_col.reordered_ind

            #_overlay1 = _overlay.copy()            

#            reorderedCols = result.dendrogram_col.reordered_ind
#            _overlay = _overlay

            #_overlay2 = _overlay.copy().iloc[reorderedCols,reorderedCols]

#            result = sns.clustermap(_correlation,metric=_metric,cmap=plt.cm.jet,square=True,figsize=_figsize,annot=_overlay, fmt='d')
            
            #print(_overlay1.columns == _overlay2.columns)
            #print(_overlay1 == _overlay2)

            #print(_overlay1.columns)
            #print(_overlay1.columns)
            #print(_overlay1)
            #print(_overlay2)
            
            #return _overlay1, _overlay2
#            return result, _overlay
            
    else:
        _fig = plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=_overlay, fmt='d')
    _progress.value += 1
    
#def plotAll():
    # loop on question types
    # loop on temporalities
    # loop on representations
    ## basic stats:
    ### mean score
    ### median score
    ### std
    ## percentage correct
    ### percentage correct - 3 columns
    ### percentage cross correct
    ### percentage cross correct, conditionnally
    ##correlation
    ### simple heatmap
#    plotCorrelationMatrix
    ### clustermap
#    plotCorrelationMatrix

In [ ]:
def plotSamples(samples):
    """Run plotBasicStats on every (sample, title) pair of `samples`, with a progress bar.

    samples: iterable of [sample, title] pairs, e.g. as built by getDemographicSamples.

    A message is printed if the progress bar does not end on len(samples). The message
    previously read the undefined name `__progress`, which raised a NameError instead of
    printing; it now reads the local `_progress`.
    """
    _progress = FloatProgress(min=0, max=len(samples))
    display(_progress)

    for sample, title in samples:
        plotBasicStats(sample, title)
        _progress.value += 1

    if(_progress.value != len(samples)):
        print("__progress.value=" + str(_progress.value) + " != len(samples)=" + str(len(samples)))

In [ ]:
# for per-gform, manual analysis
def getGFormDataPreview(_GFUserId, sample):
    """Summarize every survey that `_GFUserId` has in the global gform, for manual inspection.

    Returns a dict keyed 'survey0', 'survey1', ... Each value holds the survey's date,
    its stored temporality, the temporality derived from its own answers, its score, its
    gender and age answers, and the [row label, player ID] pairs of the surveys of
    `sample` that share its demographics (getMatchingDemographics).
    """
    userSurveys = gform[gform[localplayerguidkey] == _GFUserId]
    preview = {}

    for position in range(len(userSurveys)):
        survey = userSurveys.iloc[position]

        details = {
            'date': survey['Timestamp'],
            'temporality RM': survey['Temporality'],
            'temporality GF': getGFormRowGFormTemporality(survey),
            'score': getGFormRowScore(survey),
            'genderAge': [survey['What is your gender?'], survey['How old are you?']],
        }

        # search for other users with similar demographics
        similarSurveys = getMatchingDemographics(sample, survey)
        details['demographic matches'] = [
            [rowLabel, similarSurveys.loc[rowLabel, localplayerguidkey]]
            for rowLabel in similarSurveys.index
        ]

        preview['survey' + str(position)] = details

    return preview

sample getters

set operators


In [ ]:
# indices do not need to be reset as they all come from gform
def getUnionQuestionnaires(sample1, sample2):
    """Return the rows of `sample1` and `sample2` stacked together, without duplicate rows.

    A warning is printed when the two samples do not have the same columns. The check
    uses Index.equals, so samples with a different number of columns trigger the warning;
    the former element-wise comparison raised a ValueError in that case.
    """
    if not sample1.columns.equals(sample2.columns):
        print("warning: parameter columns are not the same")
    return pd.concat([sample1, sample2]).drop_duplicates()

In [ ]:
# indices do not need to be reset as they all come from gform
def getIntersectionQuestionnaires(sample1, sample2):
    """Return the rows common to `sample1` and `sample2`, without duplicate rows.

    The rows are matched with an inner pd.merge on the columns the two samples share.
    A warning is printed when the two samples do not have the same columns. The check
    uses Index.equals, so samples with a different number of columns trigger the warning;
    the former element-wise comparison raised a ValueError in that case.
    """
    if not sample1.columns.equals(sample2.columns):
        print("warning: parameter columns are not the same")
    return pd.merge(sample1, sample2, how = 'inner').drop_duplicates()

In [ ]:
# get sample1 and sample2 rows where users are common to sample1 and sample2
def getIntersectionUsersSurveys(sample1, sample2):
    """Return the surveys, from either sample, of the users present in both samples.

    Users are identified by the player-ID column; the two filtered samples are combined
    with getUnionQuestionnaires.
    """
    ids1 = sample1[localplayerguidkey]
    ids2 = sample2[localplayerguidkey]
    shared1 = sample1[ids1.isin(ids2)]
    shared2 = sample2[ids2.isin(ids1)]
    return getUnionQuestionnaires(shared1, shared2)

Users who answered either before or after


In [ ]:
# Headers of the four gform questions asking whether the respondent has already played
# Hero.Coli; they are used as column keys by the getGForm* helpers below.
QPlayed1 = 'Have you ever played an older version of Hero.Coli before?'
QPlayed2 = 'Have you played the current version of Hero.Coli?'
QPlayed3 = 'Have you played the arcade cabinet version of Hero.Coli?'
QPlayed4 = 'Have you played the Android version of Hero.Coli?'

In [ ]:
def getRMBefores(sample):
    """Return the rows of `sample` whose 'Temporality' value is 'before'."""
    isBefore = sample['Temporality'] == 'before'
    return sample[isBefore]

In [ ]:
def getRMAfters(sample):
    """Return the rows of `sample` whose 'Temporality' value is 'after'."""
    isAfter = sample['Temporality'] == 'after'
    return sample[isAfter]

In [ ]:
# returns users who declared that they have never played the game, whatever platform
#  previousPlayPositives is defined in '../Static data/English localization.ipynb'
def getGFormBefores(sample):
    """Return the rows of `sample` where none of the four QPlayed answers is in previousPlayPositives."""
    neverPlayed = ~sample[QPlayed1].isin(previousPlayPositives)
    for question in (QPlayed2, QPlayed3, QPlayed4):
        neverPlayed = neverPlayed & ~sample[question].isin(previousPlayPositives)
    return sample[neverPlayed]

In [ ]:
def isGFormBefore(surveyAnswerIndex, _gform):
    """Tell whether the row labelled `surveyAnswerIndex` in `_gform` is kept by getGFormBefores."""
    singleRow = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    return len(getGFormBefores(singleRow)) == 1

In [ ]:
# returns users who declared that they have already played the game, whatever platform
#  previousPlayPositives is defined in '../Static data/English localization.ipynb'
def getGFormAfters(sample):
    """Return the rows of `sample` where at least one of the four QPlayed answers is in previousPlayPositives."""
    alreadyPlayed = sample[QPlayed1].isin(previousPlayPositives)
    for question in (QPlayed2, QPlayed3, QPlayed4):
        alreadyPlayed = alreadyPlayed | sample[question].isin(previousPlayPositives)
    return sample[alreadyPlayed]

In [ ]:
def isGFormAfter(surveyAnswerIndex, _gform):
    """Tell whether the row labelled `surveyAnswerIndex` in `_gform` is kept by getGFormAfters."""
    singleRow = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    return len(getGFormAfters(singleRow)) == 1

In [ ]:
# returns an element of answerTemporalities
#  previousPlayPositives is defined in '../Static data/English localization.ipynb'
def getGFormRowGFormTemporality(_gformRow):
    """Derive a temporality from the row's own answers to the four QPlayed questions.

    Returns answerTemporalities[1] as soon as one of those answers is in
    previousPlayPositives, answerTemporalities[0] otherwise.
    """
    playedQuestions = (QPlayed1, QPlayed2, QPlayed3, QPlayed4)
    hasPlayed = any(_gformRow[question] in previousPlayPositives for question in playedQuestions)
    return answerTemporalities[1] if hasPlayed else answerTemporalities[0]

Users who answered both before and after


In [ ]:
def getSurveysOfUsersWhoAnsweredBoth(sample, gfMode = True, rmMode = False):
    """Return the surveys of the users who have both a 'before' and an 'after' survey in `sample`.

    gfMode: classify surveys with getGFormBefores / getGFormAfters.
    rmMode: classify surveys with getRMBefores / getRMAfters.
    When both modes are set the two filters are chained; when neither is set, both
    groups are the whole sample.
    """
    befores, afters = sample, sample

    if gfMode:
        befores, afters = getGFormBefores(befores), getGFormAfters(afters)

    if rmMode:
        befores, afters = getRMBefores(befores), getRMAfters(afters)

    return getIntersectionUsersSurveys(befores, afters)

In [ ]:
def getSurveysThatAnswered(sample, questionsAndPositiveAnswers, hardPolicy = True):
    """Return the rows of `sample` whose answers match the given positive answers.

    sample: DataFrame of surveys.
    questionsAndPositiveAnswers: iterable of [column name, accepted values] pairs.
    hardPolicy: when true, a row must match every pair; otherwise one matching pair is
        enough. With no pair at all, the hard policy keeps every row and the soft policy
        keeps none.

    The mask is built on `sample.index` for both policies. The soft policy used to build
    it on range(len(sample)), which misaligned with the per-question masks whenever the
    sample did not carry a 0..n-1 index (e.g. any filtered subset of the form).
    """
    if hardPolicy:
        # start from "all rows" and narrow down
        filterSeries = pd.Series(True, index=sample.index)
        for question, positiveAnswers in questionsAndPositiveAnswers:
            filterSeries = filterSeries & (sample[question].isin(positiveAnswers))
    else:
        # start from "no row" and widen
        filterSeries = pd.Series(False, index=sample.index)
        for question, positiveAnswers in questionsAndPositiveAnswers:
            filterSeries = filterSeries | (sample[question].isin(positiveAnswers))
    return sample[filterSeries]

In [ ]:
# surveys of people who have studied biology, and/or know about synthetic biology, and/or about BioBricks
def getSurveysOfBiologists(sample, hardPolicy = True):
    """Filter `sample` on the biology education, synthetic biology and BioBricks questions.

    Each question is matched against its list of positive answers and the result is
    delegated to getSurveysThatAnswered with the given `hardPolicy`.
    'Are you interested in biology?' is not part of the criteria.
    """
    criteria = [
        ['How long have you studied biology?', biologyStudyPositives],
        ['Before playing Hero.Coli, had you ever heard about synthetic biology?', yesNoIdontknowPositives],
        ['Before playing Hero.Coli, had you ever heard about BioBricks?', yesNoIdontknowPositives],
    ]
    return getSurveysThatAnswered(sample, criteria, hardPolicy)

In [ ]:
# surveys of people who play video games and/or are interested in them
def getSurveysOfGamers(sample, hardPolicy = True):
    """Filter `sample` on the video game interest and video game practice questions.

    Each question is matched against its list of positive answers and the result is
    delegated to getSurveysThatAnswered with the given `hardPolicy`.
    """
    criteria = [
        ['Are you interested in video games?', interestPositives],
        ['Do you play video games?', frequencyPositives],
    ]
    return getSurveysThatAnswered(sample, criteria, hardPolicy)

In [ ]:
def getSurveysWithMatchingAnswers(sample, _gformRow, strictList, extendedList = [], hardPolicy = False):
    """Return the rows of `sample` that gave the same answers as `_gformRow`.

    sample: DataFrame of surveys to search.
    _gformRow: reference survey; its answers are the values to match.
    strictList: questions that must always match.
    extendedList: questions that must also match when `hardPolicy` is true.
    hardPolicy: whether `extendedList` is taken into account.

    The question list is built on a copy: the previous `questions += extendedList`
    extended the caller's `strictList` in place each time the hard policy was used.
    """
    questions = list(strictList)

    if hardPolicy:
        questions.extend(extendedList)

    # each question only accepts the reference row's own answer
    questionsAndPositiveAnswers = [[q, [_gformRow[q]]] for q in questions]

    return getSurveysThatAnswered(sample, questionsAndPositiveAnswers, True)

In [ ]:
def getMatchingDemographics(sample, _gformRow, hardPolicy = False):
    """Return the surveys of `sample` demographically matching `_gformRow`.

    Age and gender are always compared. The remaining questions are handed to
    getSurveysWithMatchingAnswers as its extended list, together with
    `hardPolicy`, which decides whether they are compared as well.

    sample: survey DataFrame to filter.
    _gformRow: reference survey whose answers must be matched.
    hardPolicy: forwarded unchanged to getSurveysWithMatchingAnswers.
    """
    # age and gender
    alwaysCompared = [
        'How old are you?',
        'What is your gender?',
    ]

    comparedOnDemand = [
        # interests, hobbies, and knowledge - evaluation may vary after playing
        # ('Are you interested in biology?' is not part of this list)
        'Are you interested in video games?',
        'Do you play video games?',
        'How long have you studied biology?',
        'Before playing Hero.Coli, had you ever heard about synthetic biology?',
        'Before playing Hero.Coli, had you ever heard about BioBricks?',
        # language may vary: players may have missed the opportunity to set it,
        # or may want to try and change it
        'Language',
    ]

    return getSurveysWithMatchingAnswers(
        sample,
        _gformRow,
        alwaysCompared,
        extendedList = comparedOnDemand,
        hardPolicy = hardPolicy,
    )

Utility functions for sampling


In [ ]:
def getDemographicSamples(rootSample):
    """Split `rootSample` into labelled demographic sub-samples.

    Returns a list of [sample, label] pairs, in this order: the root sample
    itself, English and French surveys (column 'Language'), female and male
    respondents (column 'What is your gender?'), then biologists and gamers,
    each once with the default policy ('strict') and once with the policy
    disabled ('broad').
    """
    language = rootSample['Language']
    gender = rootSample['What is your gender?']

    samples = []
    samples.append([rootSample, 'root sample'])
    samples.append([rootSample[language == 'en'], 'English'])
    samples.append([rootSample[language == 'fr'], 'French'])
    samples.append([rootSample[gender == 'Female'], 'female'])
    samples.append([rootSample[gender == 'Male'], 'male'])
    samples.append([getSurveysOfBiologists(rootSample), 'biologists - strict'])
    samples.append([getSurveysOfBiologists(rootSample, False), 'biologists - broad'])
    samples.append([getSurveysOfGamers(rootSample), 'gamers - strict'])
    samples.append([getSurveysOfGamers(rootSample, False), 'gamers - broad'])
    return samples

In [ ]:
def getTemporalitySamples(rootSample):
    """Split `rootSample` into labelled before/after sub-samples.

    Returns a list of [sample, label] pairs: the root sample, then the
    'befores' and the 'afters' (RedMetrics-based, Google-form-based, and the
    RedMetrics filter applied on top of the Google form one), and finally the
    surveys of users who answered both before and after, for each combination
    of the gfMode / rmMode flags used below.
    """
    # befores
    rmBefores = getRMBefores(rootSample)
    gfBefores = getGFormBefores(rootSample)
    gfAndRMBefores = getRMBefores(getGFormBefores(rootSample))

    # afters
    rmAfters = getRMAfters(rootSample)
    gfAfters = getGFormAfters(rootSample)
    gfAndRMAfters = getRMAfters(getGFormAfters(rootSample))

    # users who answered both before and after
    gfBoth = getSurveysOfUsersWhoAnsweredBoth(rootSample, gfMode = True, rmMode = False)
    rmBoth = getSurveysOfUsersWhoAnsweredBoth(rootSample, gfMode = False, rmMode = True)
    gfAndRMBoth = getSurveysOfUsersWhoAnsweredBoth(rootSample, gfMode = True, rmMode = True)

    return [
        [rootSample, 'root sample'],
        [rmBefores, 'RedMetrics befores'],
        [gfBefores, 'Google form befores'],
        [gfAndRMBefores, 'GF & RedMetrics befores'],
        [rmAfters, 'RedMetrics afters'],
        [gfAfters, 'Google form afters'],
        [gfAndRMAfters, 'GF & RedMetrics afters'],
        [gfBoth, 'GF both before and after'],
        [rmBoth, 'RM both before and after'],
        [gfAndRMBoth, 'GF & RM both before and after'],
    ]

checkpoint validation


In [ ]:
#function that returns the list of checkpoints from user id
def getValidatedCheckpoints( userId, _form = gform ):
    """Return the checkpoints validated by `userId`, one entry per questionnaire.

    userId: identifier of the respondent.
    _form: survey DataFrame to look the user up in.

    For every column of getCorrections(userId) whose name contains
    correctionsColumnNameStem, the checkpoints associated (through
    checkpointQuestionMatching['checkpoint']) with the questions corrected as
    True are collected, de-duplicated, stripped of empty names, sorted and
    re-indexed from 0.

    Returns a pd.Series holding one such Series per corrections column. When
    the user never answered, a message is printed and the result is empty.
    """
    _validatedCheckpoints = []

    if not hasAnswered( userId, _form = _form ):
        print("user " + str(userId) + " has never answered")
        return pd.Series(_validatedCheckpoints)

    _columnAnswers = getCorrections( userId, _form = _form)
    _checkpoints = checkpointQuestionMatching['checkpoint']
    _questionCount = len(checkpointQuestionMatching)

    for _columnName in _columnAnswers.columns:
        # only work on corrected columns
        if correctionsColumnNameStem not in _columnName:
            continue

        _corrections = _columnAnswers[_columnName]

        # Build the per-question values in one go, as objects: filling a
        # float (NaN) Series with strings item by item relies on a silent
        # dtype upcast that recent pandas versions deprecate.
        # '== True' is kept on purpose: a plain truthiness test would also
        # accept NaN or non-empty strings.
        _questionnaireValidatedCheckpointsPerQuestion = pd.Series(
            [
                _checkpoints[_index] if _corrections[_index] == True else ''
                for _index in range(0, _questionCount)
            ],
            dtype=object,
        )

        _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
        # '' stands for "question not validated" (or no checkpoint name)
        _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints != '']
        _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints).sort_values()
        _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))

        _validatedCheckpoints.append(_questionnaireValidatedCheckpoints)

    return pd.Series(_validatedCheckpoints)

def getValidatedCheckpointsCounts( _userId, _form = gform ):
    """Return, as a list, the size of each entry of getValidatedCheckpoints(_userId).

    _userId: identifier of the respondent.
    _form: survey DataFrame, forwarded to getValidatedCheckpoints.
    """
    return [
        len(checkpointsList)
        for checkpointsList in getValidatedCheckpoints(_userId, _form = _form)
    ]

def getNonValidated( checkpoints ):
    """Return, for each list of validated checkpoints, the ones still missing.

    checkpoints: collection of Series of validated checkpoint names.

    Each entry is subtracted from validableCheckpoints (np.setdiff1d, hence
    sorted and unique), empty names are dropped and the result is re-indexed
    from 0; the per-entry results are returned wrapped in a pd.Series.
    When `checkpoints` is empty, validableCheckpoints itself is returned.
    """
    # nothing validated at all: every validable checkpoint is missing
    if 0 == len(checkpoints):
        return validableCheckpoints

    def _missingFrom(_validation):
        # validable checkpoints that are absent from this validation list
        _missing = pd.Series(np.setdiff1d(validableCheckpoints.values, _validation.values))
        _missing = _missing[_missing != '']
        return _missing.reset_index(drop=True)

    return pd.Series([_missingFrom(_validation) for _validation in checkpoints])

def getNonValidatedCheckpoints( userId, _form = gform ):
    """Return the checkpoints `userId` has not validated.

    Chains getValidatedCheckpoints (looked up in `_form`) and getNonValidated.
    """
    return getNonValidated(getValidatedCheckpoints( userId, _form = _form ))

def getNonValidatedCheckpointsCounts( userId, _form = gform ):
    """Return, as a list, the length of each item of getNonValidatedCheckpoints(userId).

    userId: identifier of the respondent.
    _form: survey DataFrame, forwarded to getNonValidatedCheckpoints.

    NOTE(review): when getNonValidated falls back to returning
    validableCheckpoints itself, the items iterated here are that Series'
    elements rather than per-questionnaire lists, so the lengths are those of
    the individual elements - confirm this is intended.
    """
    return [
        len(checkpointsList)
        for checkpointsList in getNonValidatedCheckpoints(userId, _form = _form)
    ]

p(correct answer to question N | given answer to question P)


In [ ]:
# returns all rows of Google form's answers that contain an element
#   of the array 'choice' for question number 'questionIndex'
def getAllAnswerRows(questionIndex, choice, _form = gform ):
    """Select the rows of `_form` whose answer to one question is in `choice`.

    questionIndex: position of the question's column in `_form`.
    choice: collection of accepted answers.
    _form: survey DataFrame to filter.
    """
    _questionColumn = _form.iloc[:, questionIndex]
    _isChosen = _questionColumn.isin(choice)
    return _form[_isChosen]

def getPercentCorrectPerColumn(_df):
    """Return, per column of `_df`, the share of rows answered correctly.

    Only columns positioned from firstEvaluationQuestionIndex up to (but not
    including) the last three columns are evaluated; the others stay NaN.
    A cell counts as correct when its string form starts with the string form
    of correctAnswers at the column's position.

    Returns a Series indexed like `_df.columns`, holding correct/row-count
    ratios, plus an extra 'Count' entry holding the number of rows.
    """
    _rowCount = len(_df)
    _columnNames = _df.columns
    _ratios = pd.Series(np.full(len(_columnNames), np.nan), index=_columnNames)
    # the last three columns are not evaluation questions
    _evaluationEnd = len(_columnNames) - 3

    # without rows every entry stays NaN
    if _rowCount > 0:
        for _name in _columnNames:
            _position = _columnNames.get_loc(_name)
            if not (firstEvaluationQuestionIndex <= _position < _evaluationEnd):
                continue

            _expectedStart = str(correctAnswers[_position])
            _ratios[_name] = sum(
                1
                for _rowIndex in _df.index
                if str(_df[_name][_rowIndex]).startswith(_expectedStart)
            )

    _ratios = _ratios / _rowCount
    _ratios['Count'] = _rowCount
    return _ratios

def getPercentCorrectKnowingAnswer(questionIndex, choice, _form = gform):
    """Return the per-column share of correct answers among selected rows.

    The rows are those of `_form` whose answer to the question at position
    `questionIndex` belongs to `choice` (see getAllAnswerRows); the ratios are
    computed by getPercentCorrectPerColumn.
    """
    return getPercentCorrectPerColumn(
        getAllAnswerRows(questionIndex, choice, _form = _form)
    )

Filtering users


In [ ]:
def getTestAnswers( _form = gform, _rmDF = rmdf152, _rmTestDF = normalizedRMDFTest, includeAndroid = True):
    """Return the rows of `_form` that were submitted by test users.

    A row is kept when its player id (column localplayerguidkey) appears among
    the flattened values of testUsers.

    _rmDF, _rmTestDF and includeAndroid are accepted but not used by this
    function.
    """
    _playerIds = _form[localplayerguidkey]
    _isTestUser = _playerIds.isin(testUsers.values.flatten())
    return _form[_isTestUser]

Initialization of gform


In [ ]:
# Executed as soon as this notebook is run: calls setAnswerTemporalities with
# its default arguments. The function is defined elsewhere in this file.
# NOTE(review): presumably tags each gform answer as given before or after
# playing - confirm against its definition.
setAnswerTemporalities()