In [ ]:
%run "../Functions/6. Time analysis.ipynb"
%run "../Utilities/Plot.ipynb"
In [ ]:
# Headline counts of the survey data set, printed as "label: value" lines.
_surveySummary = [
    ("surveys", len(gform)),
    ("unique users", getUniqueUserCount(gform)),
    ("RM before", len(gform[gform['Temporality'] == 'before'])),
    ("GF before", len(getGFormBefores(gform))),
    ("RM after", len(gform[gform['Temporality'] == 'after'])),
    ("GF after", len(getGFormAfters(gform))),
    ("unique biologists", getUniqueUserCount(getSurveysOfBiologists(gform))),
    ("unique gamers", getUniqueUserCount(getSurveysOfGamers(gform))),
    ("unique perfect users", getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform))),
]
for _summaryLabel, _summaryValue in _surveySummary:
    print("%s: %s" % (_summaryLabel, _summaryValue))
In [ ]:
# Same headline counts, laid out as a two-column markdown table
# followed by the stem of the data files the numbers come from.
_surveySummaryRows = [
    ("surveys", len(gform)),
    ("unique users", getUniqueUserCount(gform)),
    ("RM before", len(gform[gform['Temporality'] == 'before'])),
    ("GF before", len(getGFormBefores(gform))),
    ("RM after", len(gform[gform['Temporality'] == 'after'])),
    ("GF after", len(getGFormAfters(gform))),
    ("unique biologists", getUniqueUserCount(getSurveysOfBiologists(gform))),
    ("unique gamers", getUniqueUserCount(getSurveysOfGamers(gform))),
    ("unique perfect users", getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform))),
]
print("category | count")
print("--- | ---")
for _rowLabel, _rowValue in _surveySummaryRows:
    print("%s | %s" % (_rowLabel, _rowValue))
print()
# Alternative footer (current date instead of the data file stem):
#print("(" + str(pd.to_datetime('today').date()) + ")")
print("("+dataFilesNamesStem+")")
In [ ]:
pd.to_datetime('today')
In [ ]:
plotSamples(getDemographicSamples(gform))
In [ ]:
plotSamples(getTemporalitySamples(gform))
In [ ]:
# "Before" surveys according to each of the two available filters.
# NOTE(review): GF / RM presumably stand for Google Form / RedMetrics - confirm.
gf_befores = getGFormBefores(gform)
rm_befores = getRMBefores(gform)
# Surveys that pass both filters: the GF filter is applied first, then the RM filter.
gfrm_befores = getRMBefores(getGFormBefores(gform))
In [ ]:
plotSamples(getDemographicSamples(gf_befores))
In [ ]:
# "After" surveys according to each of the two available filters.
# NOTE(review): GF / RM presumably stand for Google Form / RedMetrics - confirm.
gf_afters = getGFormAfters(gform)
rm_afters = getRMAfters(gform)
# Surveys that pass both "after" filters (GF filter first, then RM filter).
# The GF step must use getGFormAfters: filtering the GF *befores* here would
# mix the two temporalities instead of mirroring gfrm_befores.
gfrm_afters = getRMAfters(getGFormAfters(gform))
In [ ]:
plotSamples(getDemographicSamples(gf_afters))
In [ ]:
# Surveys returned by getSurveysOfUsersWhoAnsweredBoth for three combinations
# of the gfMode / rmMode flags: GF only, RM only, and both at once.
# NOTE(review): "both" presumably means a before and an after survey, with the
# flags choosing which temporality criterion must hold - confirm against the helper.
gf_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = False, rmMode = True)
gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)
In [ ]:
plotSamples(getDemographicSamples(gf_both))
In [ ]:
plotSamples(getDemographicSamples(rm_both))
In [ ]:
plotSamples(getDemographicSamples(gfrm_both))
In [ ]:
# Cohort of surveys whose 'Language' column equals 'en'.
cohortEN = gform[gform['Language'] == 'en']
In [ ]:
plotSamples(getTemporalitySamples(cohortEN))
In [ ]:
# Cohort of surveys whose 'Language' column equals 'fr'.
cohortFR = gform[gform['Language'] == 'fr']
In [ ]:
plotSamples(getTemporalitySamples(cohortFR))
In [ ]:
# Cohort of surveys answering 'Female' to 'What is your gender?'.
cohortF = gform[gform['What is your gender?'] == 'Female']
In [ ]:
plotSamples(getTemporalitySamples(cohortF))
In [ ]:
# Cohort of surveys answering 'Male' to 'What is your gender?'.
cohortM = gform[gform['What is your gender?'] == 'Male']
In [ ]:
plotSamples(getTemporalitySamples(cohortM))
In [ ]:
cohortBioS = getSurveysOfBiologists(gform)
In [ ]:
plotSamples(getTemporalitySamples(cohortBioS))
In [ ]:
# NOTE(review): the positional False presumably switches getSurveysOfBiologists
# to a broader (non-strict) definition than the default used for cohortBioS -
# confirm against the helper's signature.
cohortBioB = getSurveysOfBiologists(gform, False)
In [ ]:
plotSamples(getTemporalitySamples(cohortBioB))
In [ ]:
cohortGamS = getSurveysOfGamers(gform)
In [ ]:
plotSamples(getTemporalitySamples(cohortGamS))
In [ ]:
# NOTE(review): the positional False presumably switches getSurveysOfGamers
# to a broader (non-strict) definition than the default used for cohortGamS -
# confirm against the helper's signature.
cohortGamB = getSurveysOfGamers(gform, False)
In [ ]:
plotSamples(getTemporalitySamples(cohortGamB))
In [ ]:
# Output of getAllBinarized for the surveys selected by getRMBefores;
# this is the input of the "before" correlation plots below.
sciBinarizedBefore = getAllBinarized(_form = getRMBefores(gform))
# Alternative source, disabled.
# NOTE(review): getGFBefores is not used anywhere else in this section - confirm it exists before re-enabling.
#sciBinarizedBefore = getAllBinarized(getGFBefores())
In [ ]:
# Unclustered correlation heatmap for sciBinarizedBefore, annotated and
# labelled with question numbers.
plotCorrelationMatrix(
    sciBinarizedBefore,
    _title='Correlations on survey questions before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Clustered variant of the sciBinarizedBefore heatmap ('correlation' metric).
# The call's two return values are kept in thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    sciBinarizedBefore,
    _metric='correlation',
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
sciBinarizedAfter = getAllBinarized(_form = getRMAfters(gform))
In [ ]:
# Unclustered correlation heatmap for sciBinarizedAfter, annotated and
# labelled with question numbers.
plotCorrelationMatrix(
    sciBinarizedAfter,
    _title='Correlations on survey questions after',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Clustered variant of the sciBinarizedAfter heatmap ('correlation' metric).
# The call's two return values are kept in thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    sciBinarizedAfter,
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
    _metric='correlation',
)
# NOTE: `overlay` is deliberately not drawn here. Axes.annotate() requires
# both a text and an `xy` position, so it cannot be called with `overlay`
# alone (that raises TypeError and aborts "Run All"). The dir()/vars() dumps
# that used to follow were debugging introspection and produced no result.
In [ ]:
# Combined question set: correctAnswers followed by demographicAnswers.
# NOTE(review): assumes "+" concatenates the two collections (list-like) rather
# than adding them element-wise - confirm their types.
allQuestions = correctAnswers + demographicAnswers
# getAllBinarized output over that combined set, for the surveys selected by
# getRMBefores and getRMAfters respectively.
allBinarizedBefore = getAllBinarized(_source = allQuestions, _form = getRMBefores(gform))
allBinarizedAfter = getAllBinarized(_source = allQuestions, _form = getRMAfters(gform))
In [ ]:
# Unclustered correlation heatmap for allBinarizedBefore, annotated and
# labelled with question numbers.
plotCorrelationMatrix(
    allBinarizedBefore,
    _title='Correlation of all answers before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Unclustered correlation heatmap for allBinarizedAfter, annotated and
# labelled with question numbers.
plotCorrelationMatrix(
    allBinarizedAfter,
    _title='Correlation of all answers after',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Clustered variant of the allBinarizedAfter heatmap ('correlation' metric).
# The call's two return values are kept in thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    allBinarizedAfter,
    _metric='correlation',
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Unclustered correlation heatmap for allBinarizedBefore; same matrix and
# options as the 'Correlation of all answers before' plot, under another title.
plotCorrelationMatrix(
    allBinarizedBefore,
    _title='Correlations on all questions before',
    _abs=True,
    _clustered=False,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
# Clustered variant of the allBinarizedBefore heatmap ('correlation' metric).
# The call's two return values are kept in thisClustermap and overlay.
thisClustermap, overlay = plotCorrelationMatrix(
    allBinarizedBefore,
    _metric='correlation',
    _abs=True,
    _clustered=True,
    _questionNumbers=True,
    _annot=True,
    _figsize=(20, 20),
)
In [ ]:
In [ ]:
# Rows of rmdf152 per calendar day: each 'userTime' is reduced to its date,
# the dates are counted, and the counts are put in chronological order.
valuesPerDay = rmdf152['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='RedMetrics events', startDate=minimum152Date, endDate=maximum152Date)
In [ ]:
# Zoom on September 2017: label-based slice of the date-indexed counts
# (both bounds are datetime.date objects, like the index built above).
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Same per-day count, restricted to rows whose 'type' is 'start'
# (plotted under the title 'sessions').
valuesPerDay = rmdf152[rmdf152['type'] == 'start']['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='sessions', startDate=minimum152Date, endDate=maximum152Date)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Per-day count of users by the date of their earliest 'userTime':
# each userId contributes exactly once, on its first day of activity.
valuesPerDay = rmdf152.groupby('userId').agg({ "userTime": np.min })['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='game users', startDate=minimum152Date, endDate=maximum152Date)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Per-day count of survey respondents by the date of their earliest 'Timestamp':
# each value of the localplayerguidkey column contributes exactly once.
valuesPerDay = gform.groupby(localplayerguidkey).agg({ "Timestamp": np.min })['Timestamp'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='survey answers', startDate=minimum152Date, endDate=maximum152Date)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
def _firstSurveyDayCounts(temporality):
    """Per-day count of respondents, by date of their earliest survey with the given 'Temporality'."""
    selected = gform[gform['Temporality'] == temporality]
    earliest = selected.groupby(localplayerguidkey).agg({ "Timestamp": np.min })['Timestamp']
    return earliest.map(lambda t: t.date()).value_counts().sort_index()

beforesPerDay = _firstSurveyDayCounts('before')
aftersPerDay = _firstSurveyDayCounts('after')
undefinedPerDay = _firstSurveyDayCounts('undefined')

# One plot per temporality, all on the same date range.
plotPerDay(beforesPerDay, title='survey befores', startDate=minimum152Date, endDate=maximum152Date)
plotPerDay(aftersPerDay, title='survey afters', startDate=minimum152Date, endDate=maximum152Date)
plotPerDay(undefinedPerDay, title='survey undefined', startDate=minimum152Date, endDate=maximum152Date)
In [ ]:
# TODO: transfer part of 1.3's "'Google form analysis' functions tinkering" code here
In [ ]:
def getPercentageCrossCorrect(binarized, figsize=(40,100)):
    """Draw two side-by-side heatmaps of cross-correct answer percentages.

    Left panel: the table returned by getCrossCorrectAnswers(binarized),
    expressed as a percentage of the number of rows of `binarized`.
    Right panel: the same table with each column divided by the matching
    column total of `binarized` (titled as the conditional p(y | x)).

    Parameters:
        binarized: 2-D table of binarized answers (needs `.shape`; summed
            column-wise). Presumably one row per survey and one column per
            question - TODO confirm.
        figsize: size passed to plt.figure.

    Returns:
        None; the heatmaps are drawn on a newly created figure.
    """
    cbar_kws = dict(orientation="horizontal")

    def _toIntegerPercent(ratios):
        # A zero denominator yields inf (x/0) or NaN (0/0); neither can be
        # cast to int, so both are mapped to 0 *before* the cast.
        return ratios.replace([np.inf, -np.inf], np.nan).fillna(0).round().astype(int)

    intermediaryNumerator = getCrossCorrectAnswers(binarized).round().astype(int)*100
    percentagesCrossCorrect = _toIntegerPercent(intermediaryNumerator / binarized.shape[0])

    _fig = plt.figure(figsize=figsize)

    _ax = plt.subplot(121)
    _ax.set_title('percentage correct')
    sns.heatmap(percentagesCrossCorrect,ax=_ax,cmap=plt.cm.jet,square=True,annot=True,fmt='d',cbar_kws=cbar_kws)

    # Column totals of `binarized` (ones-vector product == sum over rows).
    totalPerQuestion = np.dot(np.ones(binarized.shape[0]), binarized)
    percentagesConditionalCrossCorrect = _toIntegerPercent(intermediaryNumerator / totalPerQuestion)

    _ax = plt.subplot(122)
    _ax.set_title('percentage correct, conditionnally: p(y | x)')
    sns.heatmap(percentagesConditionalCrossCorrect,ax=_ax,cmap=plt.cm.jet,square=True,annot=True,fmt='d',cbar_kws=cbar_kws)

    plt.tight_layout()
In [ ]:
getPercentageCrossCorrect(sciBinarizedBefore, figsize=(40,100))
In [ ]:
getPercentageCrossCorrect(sciBinarizedAfter, figsize=(40,100))
In [ ]:
# Alternative inputs for allData; exactly one assignment should be active.
# Small sample: first 10 users of rmdf152.
#allData = getAllUserVectorData( getAllUsers( rmdf152 )[:10] )
# Complete set: all users of rmdf152.
#allData = getAllUserVectorData( getAllUsers( rmdf152 ) )
# Active choice: the users returned by getAllResponders(), with correctAnswers as _source.
allData = getAllUserVectorData( getAllResponders(), _source = correctAnswers )
# First 10 of the users returned by getAllResponders().
#allData = getAllUserVectorData( getAllResponders()[:10] )
In [ ]:
plotAllUserVectorDataCorrelationMatrix(allData.T, _abs=True, _figsize = (40,40))
In [ ]:
len(allData.index)
In [ ]:
#allBinarized
In [ ]:
# Earlier approach, disabled: select the player-filtering columns directly.
#players = rmdf152.loc[:, playerFilteringColumns]
# Table produced by safeGetNormalizedRedMetricsCSV from rmdf152; the cells
# below read its 'sessionId', 'userId', 'customData.platform' and 'serverTime' columns.
players = safeGetNormalizedRedMetricsCSV( rmdf152 )
players.head(1)
In [ ]:
#players = players.dropna(how='any')
#players.head(1)
#rmdf152.head(1)
In [ ]:
players.shape[0]
In [ ]:
#players = players[~players['userId'].isin(excludedIDs)];
#players.shape[0]
In [ ]:
# Number of distinct 'sessionId' values in players (nunique ignores missing ids).
sessionscount = players["sessionId"].nunique()
sessionscount
In [ ]:
In [ ]:
# Distinct 'userId' values found in players, followed by how many there are.
uniqueplayers = players['userId'].unique()
uniqueplayers.shape[0]
In [ ]:
#uniqueplayers
In [ ]:
uniqueplatforms = players['customData.platform'].unique()
uniqueplatforms
In [ ]:
# Per session, the greatest tutorial section named in a 'reach' event.
# max() compares the section names as strings, so the "best" checkpoint is
# the lexicographically largest name.
checkpoints = (
    rmdf152.loc[:, checkpointsRelevantColumns]
    .loc[lambda events: events['type'] == 'reach', ['section', 'sessionId']]
    .loc[lambda reached: reached['section'].str.startswith('tutorial', na=False)]
    .groupby("sessionId")
    .max()
)
checkpoints.head()
In [ ]:
# Flatten the per-session maxima into a single 'maxCheckpoint' column.
maxCheckpointTable = pd.DataFrame({"maxCheckpoint" : checkpoints.values.flatten()})
# Number of sessions per best checkpoint.
maxCheckpointCounts = maxCheckpointTable["maxCheckpoint"].value_counts()
# Placeholder entry for 'Start' with no count yet, so that it takes its place
# in the index ordering applied on the next line.
maxCheckpointCounts['Start'] = None
maxCheckpointCounts = maxCheckpointCounts.sort_index()
print('\nmaxCheckpointCounts=\n{0}'.format(str(maxCheckpointCounts)))
In [ ]:
# Counts as a one-column table, and their total (sessions that reached at
# least one tutorial checkpoint).
maxCheckpointCountsTable = pd.DataFrame({"maxCheckpoint" : maxCheckpointCounts.values})
# The column sums form a Series labelled by column name; take its first entry
# by position explicitly (.iloc) instead of relying on integer-key fallback,
# which pandas has deprecated for label-indexed Series.
maxCheckpointCountsTableCount = maxCheckpointCountsTable.sum(0).iloc[0]
maxCheckpointCountsTableCount
In [ ]:
checkpoints.count()
In [ ]:
maxCheckpointCountsTable.head()
In [ ]:
maxCheckpointCountsTable.describe()
In [ ]:
genericTreatment( maxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, maxCheckpointCountsTableCount, False, True )
In [ ]:
#starts = rmdf152.loc[:, checkpointsRelevantColumns]
#starts = checkpoints[checkpoints['type']=='start'].loc[:,['playerId']]
#starts = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
#starts = checkpoints.groupby("playerId")
#starts = checkpoints.max()
#starts.head()
In [ ]:
# Every session counts as having started the first tutorial.
startTutorial1Count = sessionscount
# Sessions with no reached tutorial checkpoint = all sessions minus the
# sessions that have a best checkpoint.
neverReachedGameSessionCount = startTutorial1Count - maxCheckpointCountsTableCount
# Work on a copy: a plain assignment would alias maxCheckpointCounts, and
# filling in 'Start' below would silently modify that earlier series too.
fullMaxCheckpointCounts = maxCheckpointCounts.copy()
fullMaxCheckpointCounts['Start'] = neverReachedGameSessionCount
fullMaxCheckpointCountsTable = pd.DataFrame({"fullMaxCheckpoint" : fullMaxCheckpointCounts.values})
genericTreatment( fullMaxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, startTutorial1Count, False, True )
print('\nfullMaxCheckpointCountsTable=\n{0}'.format(fullMaxCheckpointCountsTable))
fullMaxCheckpointCountsTable.describe()
Duration of playing sessions
In [ ]:
# First and last 'serverTime' of every session.
# The aggregations are named by string so the resulting column labels are
# always 'min' / 'max'; with np.min / np.max the labels follow the NumPy
# function names ('amin' / 'amax' on older NumPy, 'min' / 'max' on newer ones).
durations = players.groupby("sessionId").agg({ "serverTime": [ "min", "max" ] })
# Session duration = last event time - first event time.
durations["duration"] = pd.to_datetime(durations["serverTime"]["max"]) - pd.to_datetime(durations["serverTime"]["min"])
# Re-express each duration with a resolution of one second.
durations["duration"] = durations["duration"].map(lambda x: np.timedelta64(x, 's'))
# Longest sessions first.
durations = durations.sort_values(by=['duration'], ascending=[False])
durations.head()
Duration plot
In [ ]:
# Drop the first 4 rows before plotting; this assumes durations is ordered by
# descending duration, so these are the 4 longest sessions (treated as outliers).
# NOTE: this cell reassigns `durations`, so re-running it alone drops 4 more
# rows each time - re-run the cell that builds `durations` first.
# .copy() makes the slice an independent table so the columns added below do
# not trigger chained-assignment warnings.
durations = durations[4:].copy()
# total_seconds() gives the whole duration; `.seconds` is only the seconds
# component and wraps around for sessions lasting a day or more.
durations["duration_seconds"] = durations["duration"].map(lambda x: pd.Timedelta(x).total_seconds())
maxDuration = np.max(durations["duration_seconds"])
# Rank 1 = longest remaining session.
durations["duration_rank"] = durations["duration_seconds"].rank(ascending=False)

durations.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
plt.legend('')
plt.xlim(0, sessionscount)
plt.ylim(0, maxDuration)

# Printed explicitly: only the last expression of a cell is displayed.
print(durations["duration_seconds"].describe())
durations.head()