Hero.Coli Data Analysis Summary

List of noteworthy results from the Hero.Coli data analysis.

Preparation


In [ ]:
%run "../Functions/6. Time analysis.ipynb"
%run "../Utilities/Plot.ipynb"

1. Google form analysis

Survey counts


In [ ]:
# Survey and respondent counts, emitted as one aligned "label: value" report.
# "RM" rows are selected on the 'Temporality' column of gform; "GF" rows come
# from the getGFormBefores / getGFormAfters helpers.
print("\n".join([
    "surveys:              %s" % len(gform),
    "unique users:         %s" % getUniqueUserCount(gform),
    "RM before:            %s" % len(gform[gform['Temporality'] == 'before']),
    "GF before:            %s" % len(getGFormBefores(gform)),
    "RM after:             %s" % len(gform[gform['Temporality'] == 'after']),
    "GF after:             %s" % len(getGFormAfters(gform)),
    "unique biologists:    %s" % getUniqueUserCount(getSurveysOfBiologists(gform)),
    "unique gamers:        %s" % getUniqueUserCount(getSurveysOfGamers(gform)),
    "unique perfect users: %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)),
]))

Formatted version (Markdown table) for nicer display


In [ ]:
# Same counts as the plain report, laid out as a Markdown table
# ("category | count" header plus "--- | ---" separator), followed by a blank
# line and the data-files stem in parentheses.
# Alternative footer (disabled): "(" + str(pd.to_datetime('today').date()) + ")"
print("\n".join([
    "category | count",
    "--- | ---",
    "surveys | %s" % len(gform),
    "unique users | %s" % getUniqueUserCount(gform),
    "RM before | %s" % len(gform[gform['Temporality'] == 'before']),
    "GF before | %s" % len(getGFormBefores(gform)),
    "RM after | %s" % len(gform[gform['Temporality'] == 'after']),
    "GF after | %s" % len(getGFormAfters(gform)),
    "unique biologists | %s" % getUniqueUserCount(getSurveysOfBiologists(gform)),
    "unique gamers | %s" % getUniqueUserCount(getSurveysOfGamers(gform)),
    "unique perfect users | %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)),
    "",
    "("+dataFilesNamesStem+")",
]))

In [ ]:
# Show the current timestamp as the cell output (records when the notebook was run).
pd.to_datetime('today')

1.1 complete sample


In [ ]:
# Plot the output of getDemographicSamples for the complete survey table.
plotSamples(getDemographicSamples(gform))

In [ ]:
# Plot the output of getTemporalitySamples for the complete survey table.
plotSamples(getTemporalitySamples(gform))

1.2 Per temporality

1.2.1 answered only before


In [ ]:
# "Before" subsets of the survey table:
#   gf_befores   - selected by getGFormBefores
#   rm_befores   - selected by getRMBefores
#   gfrm_befores - getRMBefores applied on top of getGFormBefores (both filters)
gf_befores = getGFormBefores(gform)
rm_befores = getRMBefores(gform)
gfrm_befores = getRMBefores(getGFormBefores(gform))

In [ ]:
# Plot the output of getDemographicSamples for the gf_befores subset.
plotSamples(getDemographicSamples(gf_befores))

1.2.2 answered only after


In [ ]:
# "After" subsets of the survey table, mirroring the "before" subsets:
#   gf_afters   - selected by getGFormAfters
#   rm_afters   - selected by getRMAfters
#   gfrm_afters - getRMAfters applied on top of getGFormAfters (both filters)
gf_afters = getGFormAfters(gform)
rm_afters = getRMAfters(gform)
# Fixed: this previously filtered getGFormBefores(gform), a copy-paste slip
# from the "before" cell that made gfrm_afters mix the two temporalities.
gfrm_afters = getRMAfters(getGFormAfters(gform))

In [ ]:
# Plot the output of getDemographicSamples for the gf_afters subset.
plotSamples(getDemographicSamples(gf_afters))

1.2.3 answered both before and after


In [ ]:
# Surveys of users who answered both before and after, under the three
# gfMode / rmMode combinations accepted by getSurveysOfUsersWhoAnsweredBoth:
# GF only, RM only, and both flags set.
gf_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = False, rmMode = True)
gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)

In [ ]:
# Plot the output of getDemographicSamples for gf_both (gfMode only).
plotSamples(getDemographicSamples(gf_both))

In [ ]:
# Plot the output of getDemographicSamples for rm_both (rmMode only).
plotSamples(getDemographicSamples(rm_both))

In [ ]:
# Plot the output of getDemographicSamples for gfrm_both (gfMode and rmMode).
plotSamples(getDemographicSamples(gfrm_both))

1.3 Per demography

1.3.1 English speakers


In [ ]:
# Surveys whose 'Language' column equals 'en'.
cohortEN = gform.loc[gform['Language'] == 'en']

In [ ]:
# Plot the output of getTemporalitySamples for the 'en' cohort.
plotSamples(getTemporalitySamples(cohortEN))

1.3.2 French speakers


In [ ]:
# Surveys whose 'Language' column equals 'fr'.
cohortFR = gform.loc[gform['Language'] == 'fr']

In [ ]:
# Plot the output of getTemporalitySamples for the 'fr' cohort.
plotSamples(getTemporalitySamples(cohortFR))

1.3.3 Female


In [ ]:
# Surveys whose answer to 'What is your gender?' is 'Female'.
cohortF = gform.loc[gform['What is your gender?'] == 'Female']

In [ ]:
# Plot the output of getTemporalitySamples for the 'Female' cohort.
plotSamples(getTemporalitySamples(cohortF))

1.3.4 Male


In [ ]:
# Surveys whose answer to 'What is your gender?' is 'Male'.
cohortM = gform.loc[gform['What is your gender?'] == 'Male']

In [ ]:
# Plot the output of getTemporalitySamples for the 'Male' cohort.
plotSamples(getTemporalitySamples(cohortM))

1.3.5 biologists

strict

In [ ]:
# "Strict" biologists cohort: getSurveysOfBiologists called with its default
# second argument (presumably the strict criterion - confirm in the helper).
cohortBioS = getSurveysOfBiologists(gform)

In [ ]:
# Plot the output of getTemporalitySamples for the strict biologists cohort.
plotSamples(getTemporalitySamples(cohortBioS))
broad

In [ ]:
# "Broad" biologists cohort: second positional argument set to False
# (presumably disables the strict criterion - confirm in the helper).
cohortBioB = getSurveysOfBiologists(gform, False)

In [ ]:
# Plot the output of getTemporalitySamples for the broad biologists cohort.
plotSamples(getTemporalitySamples(cohortBioB))

1.3.6 gamers

strict

In [ ]:
# "Strict" gamers cohort: getSurveysOfGamers called with its default second
# argument (presumably the strict criterion - confirm in the helper).
cohortGamS = getSurveysOfGamers(gform)

In [ ]:
# Plot the output of getTemporalitySamples for the strict gamers cohort.
plotSamples(getTemporalitySamples(cohortGamS))
broad

In [ ]:
# "Broad" gamers cohort: second positional argument set to False
# (presumably disables the strict criterion - confirm in the helper).
cohortGamB = getSurveysOfGamers(gform, False)

In [ ]:
# Plot the output of getTemporalitySamples for the broad gamers cohort.
plotSamples(getTemporalitySamples(cohortGamB))

1.4 answered only after

1.1 answers to scientific questions


In [ ]:
# Binarized answers (getAllBinarized with its default _source) for the
# surveys returned by getRMBefores.
sciBinarizedBefore = getAllBinarized(_form = getRMBefores(gform))
#sciBinarizedBefore = getAllBinarized(getGFBefores())

In [ ]:
# Unclustered correlation heatmap of sciBinarizedBefore, with the _abs,
# _annot and _questionNumbers options enabled.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
plotCorrelationMatrix(
    sciBinarizedBefore,
    _title='Correlations on survey questions before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)

In [ ]:
# Clustered variant of the heatmap for sciBinarizedBefore, using the
# 'correlation' metric; the call's two results are kept in
# thisClustermap and overlay.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
thisClustermap, overlay = plotCorrelationMatrix(
    sciBinarizedBefore,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _metric='correlation',
)

In [ ]:
# Binarized answers (getAllBinarized with its default _source) for the
# surveys returned by getRMAfters.
sciBinarizedAfter = getAllBinarized(_form = getRMAfters(gform))

In [ ]:
# Unclustered correlation heatmap of sciBinarizedAfter, with the _abs,
# _annot and _questionNumbers options enabled.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
plotCorrelationMatrix(
    sciBinarizedAfter,
    _title='Correlations on survey questions after',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)

In [ ]:
# Clustered variant of the heatmap for sciBinarizedAfter, using the
# 'correlation' metric; the call's two results are kept in
# thisClustermap and overlay.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
thisClustermap, overlay = plotCorrelationMatrix(
    sciBinarizedAfter,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _metric='correlation',
)

thisClustermap.ax_heatmap.annotate(overlay)

dir(thisClustermap)

dir(thisClustermap.ax_heatmap)

vars(thisClustermap)

vars(thisClustermap.ax_heatmap)

1.2 answers to all questions


In [ ]:
# Binarize against the combined answer set (correctAnswers + demographicAnswers)
# instead of getAllBinarized's default _source, once for the getRMBefores
# surveys and once for the getRMAfters surveys.
allQuestions = correctAnswers + demographicAnswers
allBinarizedBefore = getAllBinarized(_source = allQuestions, _form = getRMBefores(gform))
allBinarizedAfter = getAllBinarized(_source = allQuestions, _form = getRMAfters(gform))

In [ ]:
# Unclustered correlation heatmap of allBinarizedBefore, with the _abs,
# _annot and _questionNumbers options enabled.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
plotCorrelationMatrix(
    allBinarizedBefore,
    _title='Correlation of all answers before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)

In [ ]:
# Unclustered correlation heatmap of allBinarizedAfter, with the _abs,
# _annot and _questionNumbers options enabled.
plotCorrelationMatrix(
    allBinarizedAfter,
    _title='Correlation of all answers after',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)

In [ ]:
# Clustered variant of the heatmap for allBinarizedAfter, using the
# 'correlation' metric; the call's two results are kept in
# thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    allBinarizedAfter,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _metric='correlation',
)

1.3 answers to all questions, only before having played


In [ ]:
# Unclustered correlation heatmap of allBinarizedBefore, with the _abs,
# _annot and _questionNumbers options enabled.
# Helper signature, for reference:
#   plotCorrelationMatrix(_binarizedMatrix, _title='Questions\' Correlations',
#                         _abs=False, _clustered=False, _questionNumbers=False)
plotCorrelationMatrix(
    allBinarizedBefore,
    _title='Correlations on all questions before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)

In [ ]:
# Clustered variant of the heatmap for allBinarizedBefore, using the
# 'correlation' metric; the call's two results are kept in
# thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    allBinarizedBefore,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _metric='correlation',
)

1.4 answers to all questions, only after having played


In [ ]:

2. Game sessions


In [ ]:
# Number of rmdf152 rows per calendar day of 'userTime', in date order.
valuesPerDay = (
    rmdf152['userTime']
    .map(lambda t: t.date())
    .value_counts()
    .sort_index()
)
plotPerDay(
    valuesPerDay,
    title='RedMetrics events',
    startDate=minimum152Date,
    endDate=maximum152Date,
)

In [ ]:
# Zoom on the daily counts between 2017-09-01 and 2017-09-30.
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
# Number of rmdf152 rows of type 'start' per calendar day of 'userTime',
# in date order.
valuesPerDay = (
    rmdf152[rmdf152['type'] == 'start']['userTime']
    .map(lambda t: t.date())
    .value_counts()
    .sort_index()
)
plotPerDay(
    valuesPerDay,
    title='sessions',
    startDate=minimum152Date,
    endDate=maximum152Date,
)

In [ ]:
# Zoom on the daily counts between 2017-09-01 and 2017-09-30.
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
# Per calendar day, the number of users whose earliest 'userTime' falls on
# that day, in date order.
# The per-user minimum uses the native groupby .min() instead of
# agg({...: np.min}): passing NumPy callables to groupby agg is deprecated in
# recent pandas, and the result here is the same Series.
valuesPerDay = (
    rmdf152.groupby('userId')['userTime']
    .min()
    .map(lambda t: t.date())
    .value_counts()
    .sort_index()
)
plotPerDay(valuesPerDay, title='game users', startDate=minimum152Date, endDate=maximum152Date)

In [ ]:
# Zoom on the daily counts between 2017-09-01 and 2017-09-30.
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
# Per calendar day, the number of survey respondents (grouped by
# localplayerguidkey) whose earliest 'Timestamp' falls on that day.
# The per-respondent minimum uses the native groupby .min() instead of
# agg({...: np.min}): passing NumPy callables to groupby agg is deprecated in
# recent pandas, and the result here is the same Series.
valuesPerDay = (
    gform.groupby(localplayerguidkey)['Timestamp']
    .min()
    .map(lambda t: t.date())
    .value_counts()
    .sort_index()
)
plotPerDay(valuesPerDay, title='survey answers', startDate=minimum152Date, endDate=maximum152Date)

In [ ]:
# Zoom on the daily counts between 2017-09-01 and 2017-09-30.
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
def _firstSurveysPerDay(temporality):
    """Count, per calendar day, respondents' earliest survey of one temporality.

    Keeps the gform rows whose 'Temporality' equals `temporality`, takes the
    earliest 'Timestamp' per localplayerguidkey group, and returns the number
    of such first answers per date, sorted by date.
    """
    return (
        gform[gform['Temporality'] == temporality]
        .groupby(localplayerguidkey)['Timestamp']
        .min()  # native groupby min; NumPy callables in agg are deprecated
        .map(lambda t: t.date())
        .value_counts()
        .sort_index()
    )

beforesPerDay = _firstSurveysPerDay('before')
aftersPerDay = _firstSurveysPerDay('after')
undefinedPerDay = _firstSurveysPerDay('undefined')

plotPerDay(beforesPerDay, title='survey befores', startDate=minimum152Date, endDate=maximum152Date)
plotPerDay(aftersPerDay, title='survey afters', startDate=minimum152Date, endDate=maximum152Date)
plotPerDay(undefinedPerDay, title='survey undefined', startDate=minimum152Date, endDate=maximum152Date)

In [ ]:

3. Per session and per user analysis

4. User comparison

TODO: transfer part of the code from 1.3's "'Google form analysis' functions tinkering" here.

percentagesCrossCorrect


In [ ]:
def getPercentageCrossCorrect(binarized, figsize=(40,100)):
    """Draw two side-by-side annotated heatmaps of cross-correct percentages.

    Left panel ('percentage correct'): getCrossCorrectAnswers(binarized),
    multiplied by 100 and divided by the number of rows of `binarized`.
    Right panel ('... p(y | x)'): the same numerator divided by the
    per-column sums of `binarized`.

    Parameters
    ----------
    binarized : table of binarized answers
        Presumably a DataFrame with one row per survey and one 0/1 column
        per question - TODO confirm against getAllBinarized.
    figsize : tuple
        Figure size forwarded to plt.figure.

    Returns
    -------
    None. Both heatmaps are drawn on a newly created matplotlib figure.
    """
    cbar_kws = dict(orientation= "horizontal")

    intermediaryNumerator = getCrossCorrectAnswers(binarized).round().astype(int)*100
    percentagesCrossCorrect = (intermediaryNumerator / binarized.shape[0]).round().astype(int)
    plt.figure(figsize=figsize)
    _ax = plt.subplot(121)
    _ax.set_title('percentage correct')
    sns.heatmap(percentagesCrossCorrect,ax=_ax,cmap=plt.cm.jet,square=True,annot=True,fmt='d',cbar_kws=cbar_kws)

    # Column sums of `binarized`, computed as a ones-vector dot product.
    totalPerQuestion = np.dot(np.ones(binarized.shape[0]), binarized)
    # Fill undefined ratios (division by a zero total) BEFORE the integer cast:
    # astype(int) raises on NaN, so the previous astype(int).fillna(0) order
    # could never reach its fillna.
    percentagesConditionalCrossCorrect = (intermediaryNumerator / totalPerQuestion).round().fillna(0).astype(int)
    _ax = plt.subplot(122)
    _ax.set_title('percentage correct, conditionnally: p(y | x)')
    sns.heatmap(percentagesConditionalCrossCorrect,ax=_ax,cmap=plt.cm.jet,square=True,annot=True,fmt='d',cbar_kws=cbar_kws)

    plt.tight_layout()

In [ ]:
# Cross-correct percentage heatmaps for the binarized "before" answers.
getPercentageCrossCorrect(sciBinarizedBefore, figsize=(40,100))

In [ ]:
# Cross-correct percentage heatmaps for the binarized "after" answers.
getPercentageCrossCorrect(sciBinarizedAfter, figsize=(40,100))

In [ ]:
# Build allData with getAllUserVectorData. Exactly one of the alternatives
# below should be active; the others are kept commented out for quick switching.

# small sample
#allData = getAllUserVectorData( getAllUsers( rmdf152 )[:10] )

# complete set
#allData = getAllUserVectorData( getAllUsers( rmdf152 ) )

# subjects who answered the gform
allData = getAllUserVectorData( getAllResponders(), _source = correctAnswers )

# 10 subjects who answered the gform
#allData = getAllUserVectorData( getAllResponders()[:10] )

In [ ]:
# Correlation matrix of the transposed user-vector table, with _abs enabled.
plotAllUserVectorDataCorrelationMatrix(allData.T, _abs=True, _figsize = (40,40))

In [ ]:
# Number of index entries (rows) in allData.
len(allData.index)

In [ ]:
#allBinarized

5. Game map

Player filtering


In [ ]:
# Player table used by the following cells, obtained by passing rmdf152 to
# safeGetNormalizedRedMetricsCSV; the first row is displayed as a sanity check.
#players = rmdf152.loc[:, playerFilteringColumns]
players = safeGetNormalizedRedMetricsCSV( rmdf152 )
players.head(1)

In [ ]:
#players = players.dropna(how='any')
#players.head(1)
#rmdf152.head(1)

In [ ]:
# Number of rows in players.
players.shape[0]

In [ ]:
#players = players[~players['userId'].isin(excludedIDs)];
#players.shape[0]

Sessions (filtered)


In [ ]:
# Number of distinct 'sessionId' values in players (nunique ignores NaN by default).
sessionscount = players["sessionId"].nunique()
sessionscount

Sessions of dev IDs


In [ ]:

Unique players


In [ ]:
# Distinct 'userId' values present in players, followed by how many there are.
uniqueplayers = players['userId'].unique()
uniqueplayers.shape[0]

In [ ]:
#uniqueplayers

Unique platforms


In [ ]:
# Distinct values of the 'customData.platform' column in players.
uniqueplatforms = players['customData.platform'].unique()
uniqueplatforms

Checkpoints passed / furthest checkpoint (unfiltered)


In [ ]:
# For every session, the greatest 'section' value among its 'reach' events
# whose section starts with 'tutorial' (rows with a missing section are
# excluded via na=False). Built in one pass from the unfiltered rmdf152.
checkpoints = (
    rmdf152.loc[:, checkpointsRelevantColumns]
    .loc[lambda events: events['type'] == 'reach', ['section', 'sessionId']]
    .loc[lambda reached: reached['section'].str.startswith('tutorial', na=False)]
    .groupby("sessionId")
    .max()
)
checkpoints.head()

In [ ]:
# Count how many sessions share each furthest checkpoint.
maxCheckpointTable = pd.DataFrame({"maxCheckpoint" : checkpoints.values.flatten()})
maxCheckpointCounts = maxCheckpointTable["maxCheckpoint"].value_counts()
# Add a 'Start' entry with no value yet, so that it gets a slot in the sorted
# index (a later cell fills it in). NOTE(review): assigning None may change
# the Series dtype - confirm this is harmless for the sums below.
maxCheckpointCounts['Start'] = None
maxCheckpointCounts = maxCheckpointCounts.sort_index()
print('\nmaxCheckpointCounts=\n{0}'.format(str(maxCheckpointCounts)))

In [ ]:
# One-column table of the per-checkpoint counts, and their total.
maxCheckpointCountsTable = pd.DataFrame({"maxCheckpoint" : maxCheckpointCounts.values})
# Sum the column by its label. The previous `.sum(0)[0]` indexed a
# label-indexed Series by position, which pandas has deprecated.
maxCheckpointCountsTableCount = maxCheckpointCountsTable["maxCheckpoint"].sum()
maxCheckpointCountsTableCount

In [ ]:
# Non-null count per column of the per-session checkpoints table.
checkpoints.count()

In [ ]:
# Preview the first rows of the counts table.
maxCheckpointCountsTable.head()

In [ ]:
# Summary statistics of the counts table.
maxCheckpointCountsTable.describe()

In [ ]:
# genericTreatment is defined outside this notebook. NOTE(review): the
# positional arguments presumably are (table, x label, y label, lower bound,
# upper bound, two boolean options) - confirm against its definition.
genericTreatment( maxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, maxCheckpointCountsTableCount, False, True )

Session starts


In [ ]:
#starts = rmdf152.loc[:, checkpointsRelevantColumns]
#starts = checkpoints[checkpoints['type']=='start'].loc[:,['playerId']]
#starts = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
#starts = checkpoints.groupby("playerId")
#starts = checkpoints.max()
#starts.head()

In [ ]:
# Every session counts as having started; those without any reached
# checkpoint are attributed to the 'Start' entry.
startTutorial1Count = sessionscount
neverReachedGameSessionCount = startTutorial1Count - maxCheckpointCountsTableCount
# Work on a copy: a plain assignment would alias maxCheckpointCounts, so
# filling 'Start' here would also modify it and inflate
# maxCheckpointCountsTableCount if the earlier cells were re-run.
fullMaxCheckpointCounts = maxCheckpointCounts.copy()
fullMaxCheckpointCounts['Start'] = neverReachedGameSessionCount
fullMaxCheckpointCountsTable = pd.DataFrame({"fullMaxCheckpoint" : fullMaxCheckpointCounts.values})

genericTreatment( fullMaxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, startTutorial1Count, False, True )

print('\nfullMaxCheckpointCountsTable=\n{0}'.format(fullMaxCheckpointCountsTable))
fullMaxCheckpointCountsTable.describe()

Duration

Duration of playing sessions


In [ ]:
# Per-session duration: difference between the latest and earliest
# 'serverTime' of each sessionId, sorted from longest to shortest.
# NOTE(review): the 'amin' / 'amax' sub-column labels are derived from the
# names of the NumPy functions passed to agg; confirm they are still produced
# by the installed numpy/pandas versions.
durations = players.groupby("sessionId").agg({ "serverTime": [ np.min, np.max  ] })
durations["duration"] = pd.to_datetime(durations["serverTime"]["amax"]) - pd.to_datetime(durations["serverTime"]["amin"])
# Re-express each duration as a numpy timedelta64 with seconds as its unit.
durations["duration"] = durations["duration"].map(lambda x: np.timedelta64(x, 's'))
durations = durations.sort_values(by=['duration'], ascending=[False])
durations.head()

Duration plot


In [ ]:
# Skip the first 4 rows (the longest sessions, given the descending sort done
# in the previous cell) before plotting.
# NOTE: this rebinds `durations`, so re-running this cell alone trims 4 more
# rows each time; re-run the previous cell first.
# .copy() makes the slice an independent frame so the column assignments
# below do not write into a view of the original.
durations = durations[4:].copy()
# Use the full length of each duration in seconds. Timedelta.seconds is only
# the sub-day component, so sessions of a day or more were under-reported.
durations["duration_seconds"] = durations["duration"].map(lambda x: pd.Timedelta(x).total_seconds())
maxDuration = np.max(durations["duration_seconds"])
durations["duration_rank"] = durations["duration_seconds"].rank(ascending=False)
durations.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
plt.legend('')
plt.xlim(0, sessionscount)
plt.ylim(0, maxDuration)
durations["duration_seconds"].describe()
durations.head()