In [ ]:
%run "../Functions/1. Google form analysis.ipynb"
%run "../Functions/4. User comparison.ipynb"
In [ ]:
#getAllResponders()
In [ ]:
# Run the temporality helper on the survey answers.
# NOTE(review): presumably this fills gform[QTemporality], which a later cell inspects - confirm in the helper notebook.
setAnswerTemporalities(gform)
In [ ]:
# Build the per-user data vectors. Exactly one of the variants below should be active.
# variant: small sample (first 10 entries of getAllUsers)
#allData = getAllUserVectorData( getAllUsers( rmdf1522 )[:10] )
# variant: complete set
#allData = getAllUserVectorData( getAllUsers( rmdf1522 ) )
# variant (active): subjects who answered the gform
allData = getAllUserVectorData( getAllResponders() )
# variant: first 10 subjects who answered the gform
#allData = getAllUserVectorData( getAllResponders()[:10] )
In [ ]:
# 'efficiency' row of the user vectors, sorted ascending and re-indexed 0..n-1
# so that the x axis of the plot is the rank of each value.
efficiencies = allData.loc['efficiency'].sort_values().reset_index(drop=True)
efficiencies.plot(title='efficiency')
In [ ]:
# Same ranking on a logarithmic scale.
efficiencies2 = allData.loc['efficiency'].sort_values()
# Zero values are removed before taking the logarithm; the remaining values are re-indexed 0..n-1.
efficiencies2 = efficiencies2[efficiencies2 != 0].reset_index(drop=True)
efficiencies2 = np.log(efficiencies2)
efficiencies2.plot(title='efficiency log')
In [ ]:
# 'maxChapter' row of the user vectors, sorted ascending and plotted against its rank.
maxChapter = allData.loc['maxChapter'].sort_values().reset_index(drop=True)
maxChapter.plot(title='maxChapter')
In [ ]:
# Number of columns in the user-vector table.
allData.shape[1]
In [ ]:
# Inputs for the inlined getAllUserVectorData body in the next cell:
# the users to process and the correction source passed to getUserDataVector.
userIds = getAllResponders()
_source = correctAnswers
In [ ]:
# Inlined body of getAllUserVectorData: builds one data vector per user and
# joins them side by side (axis=1) into allData.
# _source is used as correction source, if we want to include answers to these questions
#def getAllUserVectorData( userIds, _source = [] ):

# progress bar, advanced once per processed user
f = FloatProgress(min=0, max=len(userIds))
display(f)

userVectors = []
for userId in userIds:
    #print(str(userId))
    f.value += 1
    userVectors.append(getUserDataVector(userId, _source = _source))

# Concatenate once at the end: calling pd.concat inside the loop copies the
# whole accumulated table on every iteration.
# With no user at all, allData keeps the original fallback value (an empty list).
allData = pd.concat(userVectors, axis=1) if userVectors else []
#print('done')
allData
In [ ]:
# Last user id visited by the loop in the previous cell.
userId
In [ ]:
# Inlined body of plotAllUserVectorDataCorrelationMatrix: computes the pairwise
# correlation of the user-vector features and draws it as a heatmap or clustermap.

# correlation methods accepted below; the first one is the fallback
methods = ['pearson', 'kendall', 'spearman']

# stand-ins for the arguments of the commented-out signature
_allUserVectorData = allData.T
_method = methods[0]
_title = 'RedMetrics Correlations'
_abs = True
_clustered = False
_figsize = (20,20)
#def plotAllUserVectorDataCorrelationMatrix(
# _allUserVectorData,
# _method = methods[0],
# _title='RedMetrics Correlations',
# _abs=False,
# _clustered=False,
# _figsize = (20,20)
#):

# three steps: correlation, optional absolute value, plot
_progress = FloatProgress(min=0, max=3)
display(_progress)

# computation of correlation matrix; unknown method names fall back to methods[0]
_m = _method if _method in methods else methods[0]
_correlation = _allUserVectorData.astype(float).corr(_m)
_progress.value += 1

if _abs:
    _correlation = _correlation.abs()
_progress.value += 1

# plot
if not _clustered:
    _fig = plt.figure(figsize=_figsize)
    _ax = plt.subplot(111)
    _ax.set_title(_title)
    sns.heatmap(_correlation, ax=_ax, cmap=plt.cm.jet, square=True)
else:
    sns.clustermap(_correlation, cmap=plt.cm.jet, square=True, figsize=_figsize)
_progress.value += 1
In [ ]:
In [ ]:
In [ ]:
# Distinct values found in the QTemporality column of the survey data.
gform[QTemporality].unique()
In [ ]:
# Non-missing entries of the 'scoreundefined' row of the user vectors.
allData.loc['scoreundefined'].dropna()
In [ ]:
# First ten entries returned by getAllUsers for rmdf1522.
getAllUsers(rmdf1522)[:10]
In [ ]:
# Total number of entries returned by getAllUsers for rmdf1522.
len(getAllUsers(rmdf1522))
In [ ]:
# Player guid and session id of every event whose type is 'start'.
userSessionsRelevantColumns = ['customData.localplayerguid', 'sessionId']
userSessions = rmdf1522.loc[rmdf1522['type'] == 'start', userSessionsRelevantColumns]
In [ ]:
# Rename the player guid column to 'userId'.
# index=str additionally applies str() to every row label, so the index becomes strings.
userSessions = userSessions.rename(index=str, columns={'customData.localplayerguid': 'userId'})
userSessions.head()
In [ ]:
#groupedUserSessions = userSessions.groupby('customData.localplayerguid')
#groupedUserSessions.head()
#groupedUserSessions.describe().head()
In [ ]:
# 'reach' events on sections whose name starts with 'tutorial',
# reduced to section, session id and timestamp.
checkpointsRelevantColumns = ['sessionId', 'customData.localplayerguid', 'type', 'section', 'userTime']
checkpoints = rmdf1522.loc[rmdf1522['type'] == 'reach', checkpointsRelevantColumns]
checkpoints = checkpoints.loc[
    # na=False: rows without a section name are treated as non-matching
    checkpoints['section'].str.startswith('tutorial', na=False),
    ['section', 'sessionId', 'userTime'],
]
#checkpoints = checkpoints.groupby("sessionId")
#checkpoints = checkpoints.max()
checkpoints.head()
In [ ]:
# Attach the checkpoint rows to their session; the outer join keeps sessions
# without checkpoints and checkpoints without a known session.
#assembled = userSessions.combine_first(checkpoints)
assembled = userSessions.merge(checkpoints, on='sessionId', how='outer')
assembled.head()
In [ ]:
# The session id was only needed for the join; drop that column.
# axis is passed by keyword: the positional form drop(label, 1) was deprecated
# and is no longer accepted by recent pandas releases.
userSections = assembled.drop('sessionId', axis=1)
userSections.head()
In [ ]:
# Discard every row that has a missing value in any column
# (the outer join above leaves NaN where one side had no match).
userSections = userSections.dropna()
userSections.head()
In [ ]:
# Column-wise maximum of the remaining columns for each user.
# Note: this rebinds 'checkpoints', replacing the event-level table built earlier.
checkpoints = userSections.groupby("userId").max()
checkpoints.head()
In [ ]:
# Earliest and latest 'userTime' per user, and the time elapsed between them.
# The aggregations are named by string so that the resulting column labels are
# always 'min' and 'max'; with the numpy callables the labels were taken from the
# functions' __name__ ('amin'/'amax' or 'min'/'max' depending on the numpy version).
userTimes = userSections.groupby("userId").agg({ "userTime": ['min', 'max'] })
# duration between first and last timestamp, truncated to whole seconds
userTimes["duration"] = (
    pd.to_datetime(userTimes["userTime"]["max"]) - pd.to_datetime(userTimes["userTime"]["min"])
).dt.floor('s')
# longest durations first
userTimes = userTimes.sort_values(by=['duration'], ascending=[False])
userTimes.head()
# Plot the time played per user against the rank of that duration.
userTimes.loc[:,'duration']
# Skip the first four rows; copy so that the new columns are added to an
# independent frame rather than to a slice of the previous one.
userTimes = userTimes[4:].copy()
# total_seconds() covers the whole duration; Timedelta.seconds only holds the
# seconds component (0-86399) and silently ignores whole days.
userTimes["duration_seconds"] = userTimes["duration"].dt.total_seconds()
maxDuration = np.max(userTimes["duration_seconds"])
# rank 1 = longest duration
userTimes["duration_rank"] = userTimes["duration_seconds"].rank(ascending=False)
userTimes.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
plt.legend('')
# NOTE(review): 139 is a hard-coded x limit - presumably the number of users at the time; confirm.
plt.xlim(0, 139)
plt.ylim(0, maxDuration)
# Earliest 'userTime' at which each section appears, over all users.
userTimedSections = userSections.groupby("section").agg({ "userTime": np.min })
userTimedSections
# same value parsed as a datetime
userTimedSections["firstReached"] = pd.to_datetime(userTimedSections["userTime"])
userTimedSections.head()
# drop() returns a new frame, so the result is assigned back; axis is passed by
# keyword because the positional form is no longer accepted by recent pandas.
userTimedSections = userTimedSections.drop('userTime', axis=1)
userTimedSections.head()
# difference between each row's first-reached time and the previous row's
userTimedSections["firstCompletionDuration"] = userTimedSections["firstReached"].diff()
userTimedSections.head()
In [ ]:
# Inlined body of getAllUserVectorDataCustom: selects survey rows by temporality
# (before / after / both), then keeps the user vectors with a given session count.

# stand-ins for the arguments of the commented-out signature
sessionCount = 1
_rmDF = rmdf1522
sample = gform
before = False
after = True
gfMode = False
rmMode = True
#def getAllUserVectorDataCustom(before, after, gfMode = False, rmMode = True, sessionCount = 1, _rmDF = rmdf1522)

# stays an empty list when neither 'before' nor 'after' is requested
userIds = []
if before and after:
    userIds = getSurveysOfUsersWhoAnsweredBoth(sample, gfMode = gfMode, rmMode = rmMode)
elif before:
    # NOTE(review): 'getGFBefores' is spelled differently from 'getGFormAfters' below - confirm both helpers exist.
    userIds = getRMBefores(sample) if rmMode else getGFBefores(sample)
elif after:
    userIds = getRMAfters(sample) if rmMode else getGFormAfters(sample)

if len(userIds) == 0:
    print("no matching user")
    result = []
else:
    # keep only the player guid of the selected survey rows
    userIds = userIds[localplayerguidkey]
    # NOTE(review): the commented-out signature of getAllUserVectorData seen earlier in this notebook has no _rmDF parameter - confirm against the loaded definition.
    allUserVectorData = getAllUserVectorData(userIds, _rmDF = _rmDF).T
    # filter with users as rows, then transpose back
    result = allUserVectorData[allUserVectorData['sessionsCount'] == sessionCount].T
In [ ]:
# Outcome of the selection cell above (a table, or an empty list when no user matched).
result
In [ ]:
# NOTE(review): getAllUserVectorDataCustom only appears in this notebook as a commented-out signature;
# this call presumably relies on a definition loaded through %run - confirm.
getAllUserVectorDataCustom(False, True)
In [ ]:
# User vectors of the users returned by getSurveysOfUsersWhoAnsweredBoth
# (both matching modes enabled), restricted to those with exactly one session.
userIdsBoth = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)[localplayerguidkey]
# transpose so that each row is a user
allUserVectorData = getAllUserVectorData(userIdsBoth).T
allUserVectorData[allUserVectorData['sessionsCount'] == 1]
In [ ]:
# Id of a single user kept for manual checks (not referenced elsewhere in the visible cells).
testUser = "3685a015-fa97-4457-ad73-da1c50210fe1"
In [ ]:
def getScoreFromBinarized(binarizedAnswers):
    """Return the per-row total of a binarized answer table.

    binarizedAnswers: table whose row labels contain correctionsColumnNameStem
    followed by an integer row label of gform.

    Returns a pandas Series holding, for each row, the sum of its entries; the
    Series is indexed by the localplayerguidkey value of the matching gform row.
    """
    def _toGformIndex(label):
        # the integer that follows the stem is the gform row label
        return int(label.split(correctionsColumnNameStem)[1])

    gformIndices = binarizedAnswers.index.map(_toGformIndex)
    # dot product with a vector of ones = sum over the columns of each row
    unitWeights = np.ones(binarizedAnswers.shape[1])
    rowTotals = np.dot(binarizedAnswers, unitWeights)
    responderIds = gform.loc[gformIndices, localplayerguidkey]
    return pd.Series(rowTotals, index=responderIds)
In [ ]:
# Scores before and after, for the users returned by getSurveysOfUsersWhoAnsweredBoth
# with rmMode only (the commented-out lines are the other mode combinations).
#allResponders = getAllResponders()
#gf_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = False, rmMode = True)
#gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gform, gfMode = True, rmMode = True)
# binarized answers of the 'before' and 'after' subsets
sciBinarizedBefore = getAllBinarized(_form = getRMBefores(rm_both))
sciBinarizedAfter = getAllBinarized(_form = getRMAfters(rm_both))
# one total per row, indexed by player guid
scoresBefore = getScoreFromBinarized(sciBinarizedBefore)
scoresAfter = getScoreFromBinarized(sciBinarizedAfter)
In [ ]:
# Median score before and after.
medianBefore = np.median(scoresBefore)
medianAfter = np.median(scoresAfter)
# Number of columns of the binarized 'before' table; this is the highest reachable
# score assuming every entry is 0 or 1 - TODO confirm.
maxScore = sciBinarizedBefore.shape[1]
In [ ]:
# Per-user progress indicators derived from the before/after scores.
indicators = pd.DataFrame()
indicators[answerTemporalities[0]] = scoresBefore
indicators[answerTemporalities[1]] = scoresAfter
# absolute progress
indicators['delta'] = scoresAfter - scoresBefore

# Room left for improvement. A value of 0 is replaced by 1 so that it can be used
# as a denominator below; replace() does this for the whole column at once and,
# unlike a per-label .loc loop, also works when an index label occurs twice.
indicators['maxPotentialDelta'] = maxScore - scoresBefore
indicators['maxPotentialDelta'] = indicators['maxPotentialDelta'].replace(0, 1)

# scores and progress expressed in units of the median 'before' score
# NOTE(review): relativeAfter is also divided by medianBefore, not medianAfter - presumably a shared baseline; confirm.
indicators['relativeBefore'] = scoresBefore / medianBefore
indicators['relativeAfter'] = scoresAfter / medianBefore
indicators['relativeDelta'] = indicators['delta'] / medianBefore

# share of the remaining room that was actually gained
indicators['realizedPotential'] = indicators['delta'] / indicators['maxPotentialDelta']

# progress relative to the initial score; an initial score of 0 is replaced by 1
# in the denominator to avoid dividing by zero
indicators['increaseRatio'] = indicators['delta'] / indicators[answerTemporalities[0]].replace(0, 1)
In [ ]:
# Show the indicator table.
indicators
In [ ]:
# Summary tuple: (min, max) of relativeBefore, (min, max) of relativeDelta,
# median 'before' score, median of relativeBefore, median of relativeDelta.
# Written as a parenthesised tuple: the previous backslash continuations ended
# with a dangling backslash after the last element.
(
    (min(indicators['relativeBefore']), max(indicators['relativeBefore'])),
    (min(indicators['relativeDelta']), max(indicators['relativeDelta'])),
    medianBefore,
    np.median(indicators['relativeBefore']),
    np.median(indicators['relativeDelta']),
)
In [ ]:
# default pair of indicators (module-level; the function takes its own arguments)
indicatorX = 'relativeBefore'
indicatorY = 'relativeDelta'

def scatterPlotIndicators(indicatorX, indicatorY):
    """Scatter-plot one column of `indicators` against another.

    indicatorX, indicatorY: column names of the module-level `indicators` table.

    Prints the range and the median of both columns, then draws the scatter plot
    with one black line at the median of each column. Returns nothing.
    """
    xs = indicators[indicatorX]
    ys = indicators[indicatorY]
    xMin, xMax, xMedian = min(xs), max(xs), np.median(xs)
    yMin, yMax, yMedian = min(ys), max(ys), np.median(ys)

    print(indicatorX + ' range: ' + str((xMin, xMax)))
    print(indicatorY + ' range: ' + str((yMin, yMax)))
    print(indicatorX + ' median: ' + str(xMedian))
    print(indicatorY + ' median: ' + str(yMedian))

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.scatter(xs, ys)
    plt.xlabel(indicatorX)
    plt.ylabel(indicatorY)

    # vertical line at the median of the x column, spanning the y range
    plt.plot([xMedian, xMedian], [yMin, yMax], 'k-', lw=2)
    # horizontal line at the median of the y column, spanning the x range
    plt.plot([xMin, xMax], [yMedian, yMedian], 'k-', lw=2)
In [ ]:
# Names of the available indicators (valid arguments for scatterPlotIndicators).
indicators.columns
In [ ]:
# relative initial score vs. progress in units of the median initial score
scatterPlotIndicators('relativeBefore', 'relativeDelta')
In [ ]:
# relative initial score vs. share of the remaining room that was gained
scatterPlotIndicators('relativeBefore', 'realizedPotential')
In [ ]:
# relative initial score vs. progress relative to the initial score
scatterPlotIndicators('relativeBefore', 'increaseRatio')
In [ ]:
# relative initial score vs. relative final score
scatterPlotIndicators('relativeBefore', 'relativeAfter')
In [ ]:
# remaining room for improvement vs. share of it that was gained
scatterPlotIndicators('maxPotentialDelta', 'realizedPotential')
In [ ]: