```
In [1]:
```import pandas as pd
import urllib
import time
import json

```
In [116]:
```studentBehavior = pd.read_json("studentBehaviorInfoOver40Class_1213.json", orient="index")
# The frame above is later read as studentBehavior[i]["memberId"],
# studentBehavior[i]["chosenVideo"], ... -- i.e. column i is used as one record.
#
# header=None keeps the first CSV row as data row 0; parse_csv_file skips that
# row and reads column 0 as the key and column 1 as an integer value.
csv_file = pd.read_csv("clusterVideoWordLevel3cluster.csv",header=None)

```
In [34]:
# print(studentBehavior[0])
# print studentBehavior[0]["memberId"]
# print (studentBehavior[0]["chosenVideo"])
# print (studentBehavior[0]["vocabularyList"])
# NOTE(review): video_to_cluster is only assigned further down, in the cell
# that generates the user profiles (video_to_cluster = parse_csv_file(csv_file)).
# As far as this notebook shows, this cell therefore only works after that
# cell has been executed.
print(video_to_cluster)
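## Desired JSON structure (initial sketch). The profiles generated below store
## the per-video averages under "avgScoreSequence" (not "scoreSequence") and
## additionally carry "scores", "tp" and "interaction".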
# [
# {
# "memberId": 12345,
# "videoSequence":
# [
# 3980,
# 7680,
# 1235
# ],
# "skippingRatioSequence":
# [
# 0,
# 0,
# 0.3
# ],
# "scoreSequence":
# [
# 80,
# 70,
# 75
# ],
# "dictionarySequence":
# [
# 0,
# 0,
# 0
# ],
# "skippingIndex": 10,
# "clusterSequence":
# [
# "C1",
# "C1",
# "C2"
# ]
# }
# ]

```
```13354

```
In [117]:
```# Generate Users Profile File
# This method returns a mapping from video (key)
# to an array of scores (value)
def parse_video_scores(scores_object):
    """Group scores by the video they were recorded for.

    Each entry of ``scores_object`` is read through its 'postId' and 'score'
    keys.  Returns a dict mapping every 'postId' to the list of its 'score'
    values, in the order the entries were encountered.
    """
    scores_by_video = {}
    for entry in scores_object:
        # setdefault creates the per-video list the first time a video is seen.
        scores_by_video.setdefault(entry['postId'], []).append(entry['score'])
    return scores_by_video
def parse_video_words(words_object):
    """Group looked-up words by the video they were recorded for.

    Each entry of ``words_object`` is read through its 'postId' and 'word'
    keys.  Returns a dict mapping every 'postId' to the list of its 'word'
    values, in the order the entries were encountered.
    """
    words_by_video = {}
    for entry in words_object:
        video = entry['postId']
        bucket = words_by_video.get(video)
        if bucket is None:
            # First word seen for this video: start its list.
            bucket = []
            words_by_video[video] = bucket
        bucket.append(entry['word'])
    return words_by_video
def parse_csv_file(csv_file):
    """Turn the two-column table into a ``{key: int(value)}`` dict.

    ``csv_file[0]`` supplies the keys and ``csv_file[1]`` the values, which
    are converted with ``int``.  Position 0 of both columns is skipped
    (the table is read with header=None, so that row is presumably the
    header line -- confirm against the CSV).
    """
    keys = csv_file[0]
    values = csv_file[1]
    return dict(
        (keys[row], int(values[row]))
        for row in range(1, len(keys))
    )
def get_cluster_cardinality(videos_cluster):
    """Count how many videos are assigned to each cluster.

    ``videos_cluster`` maps a video to its cluster.  Returns a dict mapping
    each cluster to the number of videos that map to it.
    """
    counts = {}
    for video in videos_cluster:
        cluster = videos_cluster[video]
        # Missing clusters start from 0 before being incremented.
        counts[cluster] = counts.get(cluster, 0) + 1
    return counts
def calc_skipping_ratio(scores):
    """Return the fraction of skipped items in ``scores``, rounded to 2 places.

    A score below 0 is counted as skipped.  An empty ``scores`` yields 0.0
    instead of raising ZeroDivisionError.
    """
    if len(scores) == 0:
        return 0.0
    skipped = sum(1 for score in scores if score < 0)
    # float() keeps this a true division on Python 2 as well.
    return round(float(skipped) / len(scores), 2)
def calc_avg_score(scores):
    """Return the mean of ``scores`` rounded to 2 places, or 0.

    The result is 0 when ``scores`` is empty (previously a
    ZeroDivisionError) or when it contains at least one negative (skipped)
    score -- a single skipped item zeroes the whole average.
    """
    if len(scores) == 0:
        return 0
    total = 0
    for score in scores:
        if score < 0:
            # Any skipped item makes the average 0, so stop early.
            return 0
        total += score
    # float() keeps this a true division on Python 2 as well.
    return round(float(total) / len(scores), 2)
# Build one profile per member from the behaviour records, then save all
# profiles as a JSON array.

# studentBehavior is read below as studentBehavior[index][field], i.e. one
# record per column, so the number of columns is the number of records.
# This replaces re-reading and re-parsing the whole JSON file just to count
# its rows (assumes the file is a {field: {record: value}} mapping, which is
# what the indexing below already requires).
lenStudentBehavior = len(studentBehavior.columns)
usersProfilesJsonWithIndexes = {}
video_to_cluster = parse_csv_file(csv_file)
cardinality = get_cluster_cardinality(video_to_cluster)
print(cardinality)

for index in range(lenStudentBehavior):
    record = studentBehavior[index]
    memberId = record["memberId"]
    if memberId not in usersProfilesJsonWithIndexes:
        studentProfile = {}
        studentProfile["memberId"] = memberId
        studentProfile["videoSequence"] = []
        studentProfile["skippingRatioSequence"] = []
        studentProfile["avgScoreSequence"] = []
        studentProfile["dictionarySequence"] = []
        studentProfile["skippingIndex"] = -1
        studentProfile["clusterSequence"] = []
        studentProfile["scores"] = []
        studentProfile["interaction"] = {}
        usersProfilesJsonWithIndexes[memberId] = studentProfile
    profile = usersProfilesJsonWithIndexes[memberId]

    # Adding values for every key
    # Video sequence
    profile["videoSequence"].extend(record["chosenVideo"])
    videos_scores = parse_video_scores(record["listenScore"])
    video_words = parse_video_words(record["vocabularyList"])

    for video in record["chosenVideo"]:
        scores = videos_scores[video]
        profile["scores"].append(scores)
        # Skipping ratio sequence
        profile["skippingRatioSequence"].append(calc_skipping_ratio(scores))
        # Average score sequence
        profile["avgScoreSequence"].append(calc_avg_score(scores))
        # Dictionary sequence: number of words recorded for this video.
        # The word mapping is looked up with the string form of the video id.
        profile["dictionarySequence"].append(len(video_words.get(str(video), [])))
        # Cluster sequence (video_to_cluster is keyed by string ids as well)
        profile["clusterSequence"].append(video_to_cluster[str(video)])

    # Skipping index: position of the first video with a non-zero skipping
    # ratio.  When nothing has been skipped (yet) the index stays at -1, the
    # value the profile is created with, instead of raising IndexError.
    skip_index = next(
        (pos for pos, ratio in enumerate(profile["skippingRatioSequence"]) if ratio > 0.0),
        -1
    )
    profile["skippingIndex"] = skip_index
    # Cluster turning point: cluster of the first skipped video, or None when
    # there is no skipped video (indexing with -1 would wrongly pick the
    # last cluster).
    if skip_index >= 0:
        profile["tp"] = profile["clusterSequence"][skip_index]
    else:
        profile["tp"] = None

    # Calculate interaction
    # Sum the average scores by cluster
    scores_by_cluster = {}
    for cluster, avg_score in zip(profile["clusterSequence"], profile["avgScoreSequence"]):
        scores_by_cluster[cluster] = scores_by_cluster.get(cluster, 0) + avg_score
    # Add clusters with no interaction
    for cluster in cardinality:
        scores_by_cluster.setdefault(cluster, 0)
    # Divide each sum by the cardinality of its cluster
    for cluster in scores_by_cluster:
        scores_by_cluster[cluster] = round(
            (scores_by_cluster[cluster] * 1.0) / (cardinality[cluster] * 1.0), 2)
    profile["interaction"] = scores_by_cluster

# Show one sample profile; skipped when that member is not in the data so a
# different input file does not end the cell with a KeyError.
sample_member_id = '50679'
if sample_member_id in usersProfilesJsonWithIndexes:
    print(usersProfilesJsonWithIndexes[sample_member_id])

# json.dump on the list of profiles writes "[p1, p2, ...]" with ", " between
# items, the same text the manual "[" / ", " / "]" assembly produced, and
# the with-block closes the file even if serialisation fails.
with open("studentProfile_wordLevel3cluster.json", "w") as fileStudentProfile:
    json.dump(list(usersProfilesJsonWithIndexes.values()), fileStudentProfile)

```
```{0: 69, 1: 42, 2: 124}
{'avgScoreSequence': [92.0, 93.33, 90.14, 90.17, 93.86, 92.0, 0, 0], 'videoSequence': [3913, 5186, 4974, 4802, 3711, 5881, 7126, 5797], 'memberId': '50679', 'dictionarySequence': [26, 15, 73, 32, 33, 17, 19, 40], 'scores': [[93, 91, 92, 92, 92], [93, 94, 93], [90, 80, 77, 96, 86, 97, 90, 95, 95, 98, 91, 86, 95, 86], [85, 90, 90, 93, 90, 93], [92, 88, 99, 94, 98, 91, 95], [90, 94], [77, 79, -1], [-1, -1, -1, -1, -1]], 'skippingIndex': 6, 'clusterSequence': [0, 0, 0, 2, 0, 0, 2, 2], 'tp': 2, 'interaction': {0: 6.69, 1: 0.0, 2: 0.73}, 'skippingRatioSequence': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33, 1.0]}

```
In [118]:
```# Generate csv file for classifier
# Header: one "C<cluster>" column per cluster, followed by the class label "TP".
header = ",".join(["C" + str(key) for key in cardinality] + ["TP"]) + "\n"
print(header)
# The with-block closes the file even if a row fails to build.
with open("users_interaction_wordLevel3cluster.csv", "w") as interaction:
    interaction.write(header)
    for user in usersProfilesJsonWithIndexes:
        profile = usersProfilesJsonWithIndexes[user]
        # A profile without a turning point has no class label to write.
        if profile.get("tp") is None:
            continue
        # Interaction values in the same cluster order as the header.
        cells = [str(profile["interaction"][key]) for key in cardinality]
        cells.append("C" + str(profile["tp"]))
        s = ",".join(cells) + "\n"
        print(s)
        interaction.write(s)

```
```
In [101]:
```# Generate sequence of videos by user
# One line per user: the videos up to and including the first skipped one.
# The with-block closes the file even if a row fails to build.
with open("users_video_sequence.csv", "w") as interaction:
    for user in usersProfilesJsonWithIndexes:
        profile = usersProfilesJsonWithIndexes[user]
        skip_index = profile["skippingIndex"]
        # -1 is the value a profile is created with, i.e. no skipped video
        # was recorded; such users have no sequence to export.
        if skip_index < 0:
            continue
        s = ",".join(str(x) for x in profile["videoSequence"][:skip_index + 1])
        s += "\n"
        interaction.write(s)
# Generate sequence of videos by cluster
# One line per user: the clusters of the videos up to and including the
# first skipped one.  The with-block closes the file even on errors.
with open("users_video_sequence_by_cluster.csv", "w") as interaction:
    for user in usersProfilesJsonWithIndexes:
        profile = usersProfilesJsonWithIndexes[user]
        skip_index = profile["skippingIndex"]
        # -1 is the value a profile is created with, i.e. no skipped video
        # was recorded; such users have no sequence to export.
        if skip_index < 0:
            continue
        s = ",".join(str(x) for x in profile["clusterSequence"][:skip_index + 1])
        s += "\n"
        interaction.write(s)
# TP videos
# Collect the video found at each user's skipping index.
array = []
for user in usersProfilesJsonWithIndexes:
    profile = usersProfilesJsonWithIndexes[user]
    index = profile["skippingIndex"]
    # -1 is the value a profile is created with (no skipped video); using it
    # as an index would wrongly report the user's last video.
    if index < 0:
        continue
    array.append(profile["videoSequence"][index])
# De-duplicate; sorting makes the file content reproducible across runs.
result = sorted(set(array))
with open("tp_videos.csv", "w") as interaction:
    for v in result:
        interaction.write(str(v) + "\n")
# TP clusters
# Collect the cluster found at each user's skipping index.
array = []
for user in usersProfilesJsonWithIndexes:
    profile = usersProfilesJsonWithIndexes[user]
    index = profile["skippingIndex"]
    # -1 is the value a profile is created with (no skipped video); using it
    # as an index would wrongly report the user's last cluster.
    if index < 0:
        continue
    array.append(profile["clusterSequence"][index])
# De-duplicate; sorting makes the file content reproducible across runs.
result = sorted(set(array))
with open("tp_clusters.csv", "w") as interaction:
    for v in result:
        interaction.write(str(v) + "\n")

