In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import scipy
import scipy.spatial.distance  # needed for scipy.spatial.distance.cdist below
import scipy.sparse as sp
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import re
import networkx as nx
import itertools
from pygsp import graphs, filters, plotting
from sklearn.cluster import KMeans
from sklearn import metrics
import pickle
import os
from collections import Counter
from sklearn import mixture
%load_ext autoreload
%autoreload 2
plt.rcParams['figure.figsize'] = (17, 5)
plotting.BACKEND = 'matplotlib'
In [2]:
weights_wt_students=pd.read_pickle("Graphs/students_graph_STI.pkl")
weights_wt_section=pd.read_pickle("Graphs/section_graph_STI.pkl")
weights_wrt_prof=pd.read_pickle("Graphs/prof_graph_STI.pkl")
weights_wrt_assistants=pd.read_pickle("Graphs/assistants_graph_STI.pkl")
weight_requirements_same_course=pd.read_pickle("Graphs/req_course_same_req_graph_STI.pkl")
weight_course_to_requirement=pd.read_pickle("Graphs/req_course_to_req_graph_STI.pkl")
weight_course_same_requirements=pd.read_pickle("Graphs/req_same_course_graph_STI.pkl")
enrol2=pd.read_pickle("../data/cleaned_enrol_STI.pickle")
Courses=enrol2['CourseCodes'].unique()
courses=pd.read_pickle("../data/cleaned_courses_STI.pickle")
courses2=courses[courses.index.isin(Courses)]
courses_index_dico=dict(zip(Courses, np.arange(len(Courses))))
years=pd.read_pickle("Graphs/years.pkl")
In [3]:
#np.fill_diagonal(weights_wt_students,0)
# Display the graph:
G1s=graphs.Graph(weights_wt_students)
G1s.set_coordinates('spring')
G1s.plot()
In [4]:
G1s.compute_laplacian("normalized") # compute the normalized Laplacian of the graph (pygsp stores it in G1s.L)
laplacian_1=G1s.L
G1s.compute_fourier_basis(recompute=True)
# Display the eigenvalues:
plt.plot(G1s.e)
plt.title('Eigenvalues (normalized Laplacian)')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')
plt.savefig('Graphs/Graph Screenshots/kmeans_eigvals_normLap.png', format='png', dpi=100)
eig_1=G1s.e
eig_1[0:50]
Out[4]:
In [5]:
plt.plot(eig_1[0:100], 'o')
plt.title('Eigenvalues (normalized Laplacian)')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')
Out[5]:
All eigenvalues lie below 2, as expected for a normalized Laplacian (its spectrum is contained in [0, 2]).
The largest gap in the spectrum comes right after the first eigenvalue. According to the lecture, the eigengap heuristic would then suggest k = 1 cluster for K-means, but a single cluster is not useful here. We could try other values of k corresponding to the other large gaps, but since the choice is not obvious, we use other methods below to pick k.
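For reference, the eigengap heuristic takes only a couple of lines; a sketch using the eigenvalues computed above:

gaps = np.diff(eig_1)          # gaps between consecutive eigenvalues
print(np.argmax(gaps) + 1)     # k suggested by the largest gap (here, 1)
print(np.argmax(gaps[1:]) + 2) # k suggested by the largest gap after the trivial first one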
In [6]:
np.diff(G1s.e[0:50]) # differences between eigenvalues
Out[6]:
In [7]:
#compute the "silhouette" to get the k:
S=[]
kmax=20
for k in range(2,kmax):
H1=G1s.U[:,:k]
D_minus_half_1=np.diag(np.sum(weights_wt_students,1)**(-1/2)) # weighted degrees
F1=D_minus_half_1@H1
kmeans_1 = KMeans(n_clusters=k, random_state=0).fit(F1)
s=0
labels_1 = kmeans_1.labels_
centroids=kmeans_1.cluster_centers_
r=np.arange(k)
for course in range(len(Courses)):
dist=scipy.spatial.distance.cdist(centroids,[F1[course,:]], 'euclidean')
a=dist[labels_1[course]][0]
b=min(dist[r!=labels_1[course]])[0]
s+=(b-a)/max(a,b)
s/=len(Courses)
S.append(s)
# Display the silhouette:
plt.plot(range(2,kmax),S)
plt.title('Silhouette')
plt.xlabel('k')
plt.ylabel('S')
plt.savefig('Graphs/Graph Screenshots/kmeans_silhouette.png', format='png', dpi=100)
k=np.argmax(S)+2
print(k)
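As a sanity check on this hand-rolled score, scikit-learn's standard silhouette (the `metrics` module imported at the top) can be computed on the same embedding; a minimal sketch:

# Sketch: standard silhouette on the spectral embedding, for comparison
S_sk = []
for kk in range(2, kmax):
    F = np.diag(np.sum(weights_wt_students, 1)**(-1/2)) @ G1s.U[:, :kk]
    lab = KMeans(n_clusters=kk, random_state=0).fit_predict(F)
    S_sk.append(metrics.silhouette_score(F, lab)) # euclidean metric by default
print(np.argmax(S_sk) + 2) # k maximizing the standard silhouette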
In [8]:
# Keep the first k eigenvectors:
H1=G1s.U[:,:k]
D_minus_half_1=np.diag(np.sum(weights_wt_students,1)**(-1/2)) # weighted degrees
F1_K=D_minus_half_1@H1
In [9]:
# Apply k-means:
kmeans_1 = KMeans(n_clusters=k, random_state=0).fit(F1_K)
labels_1 = kmeans_1.labels_
In [10]:
G1s.plot_signal(labels_1)
In [11]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind1=np.where(labels_1==i)[0]
    print(Courses[ind1])
    print([courses2[courses2.index.str.endswith(Courses[ind1][p])].CourseTitleFR.tolist()[0] for p in range(len(ind1))])
    print('\n')
In [12]:
# Display the graph colored by cluster assignments:
values=[100*labels_1[node] for node in range(len(labels_1))]
G1=nx.from_numpy_matrix(weights_wt_students)
plt.figure(1,figsize=(16,10))
pos1 = nx.spring_layout(G1)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G1, pos1, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G1, pos1) # plot the labels
nx.draw_networkx_edges(G1, pos1) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with Kmeans')
plt.savefig('Graphs/Graph Screenshots/kmeans_clustering.png', format='png', dpi=100)
plt.show()
We then try a Gaussian mixture model (GMM) instead of K-means, which allows elliptical cluster shapes and soft (probabilistic) assignments. To choose the number of Gaussians we compare the AIC and BIC information criteria (AIC = 2p − 2 ln L̂, BIC = p ln(n) − 2 ln L̂, with p the number of free parameters, n the number of samples and L̂ the maximized likelihood); lower values are better.
In [13]:
# To determine the number of Gaussians, we compute the AIC and the BIC:
AIC=[]
BIC=[]
for k in range(1,40):
    H1=G1s.U[:,:k]
    D_minus_half_1=np.diag(np.sum(weights_wt_students,1)**(-1/2)) # weighted degrees
    F1=D_minus_half_1@H1
    g = mixture.GaussianMixture(n_components=k)
    g.fit(F1)
    labels_1b=g.predict(F1)
    AIC.append(g.aic(F1))
    BIC.append(g.bic(F1))
plt.plot(range(1,40),AIC,'r')
plt.title('AIC')
plt.xlabel('k')
plt.ylabel('AIC')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_AIC.png', format='png', dpi=100)
plt.show()
plt.plot(range(1,40),BIC,'b')
plt.title('BIC')
plt.xlabel('k')
plt.ylabel('BIC')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_BIC.png', format='png', dpi=100)
plt.show()
In [14]:
k=np.argmin(BIC)+1
print(k)
# Keep the first k eigenvectors:
H1_gmm_h=G1s.U[:,:k]
D_minus_half_1_gmm_h=np.diag(np.sum(weights_wt_students,1)**(-1/2)) # weighted degrees
F1_gmm_h=D_minus_half_1_gmm_h@H1_gmm_h
# Apply GMM and do the prediction:
g = mixture.GaussianMixture(n_components=k)
g.fit(F1_gmm_h)
labels_1c=g.predict(F1_gmm_h)
In [15]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind1=np.where(labels_1c==i)[0]
    print(Courses[ind1])
    print([courses2[courses2.index.str.endswith(Courses[ind1][p])].CourseTitleFR.tolist()[0] for p in range(len(ind1))])
    print('\n')
In [16]:
# Display the graph colored by cluster assignments:
values=[100*labels_1c[node] for node in range(len(labels_1c))]
G1=nx.from_numpy_matrix(weights_wt_students)
plt.figure(1,figsize=(16,10))
pos1 = nx.spring_layout(G1)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G1, pos1, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G1, pos1) # plot the labels
nx.draw_networkx_edges(G1, pos1) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with GMM (hard clustering)')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_clustering.png', format='png', dpi=100)
plt.show()
In [17]:
k=np.argmin(BIC)+1
print(k)
# Keep the first k eigenvectors:
H1_gmm_s=G1s.U[:,:k]
D_minus_half_1_gmm_s=np.diag(np.sum(weights_wt_students,1)**(-1/2)) # weighted degrees
F1_gmm_s=D_minus_half_1_gmm_s@H1_gmm_s
# Apply GMM and do the predictions:
g = mixture.GaussianMixture(n_components=k)
g.fit(F1_gmm_s)
labels_1b=g.predict_proba(F1_gmm_s)
In [18]:
r=np.arange(k)
multi_labels=[list(r[labels_1b[p]>0.1]) for p in range(len(labels_1b))] # assignments with probability > 0.1
labels_matrix=[]
for p in range(k):
    lab=[]
    for q in range(len(multi_labels)):
        if p in multi_labels[q]:
            lab.append(q)
    labels_matrix.append(lab)
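The double loop above just inverts the course-to-clusters map; an equivalent vectorized version (a sketch with a hypothetical name, same semantics) is:

# For each cluster p, list the courses whose membership probability exceeds 0.1:
labels_matrix_alt = [list(np.where(labels_1b[:, p] > 0.1)[0]) for p in range(k)]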
In [19]:
print(multi_labels)
In [20]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind1=labels_matrix[i]
    print(Courses[ind1])
    print([courses2[courses2.index.str.endswith(Courses[ind1][p])].CourseTitleFR.tolist()[0] for p in range(len(ind1))])
    print('\n')
In [21]:
# Display the graph colored by cluster assignments:
values=[100*np.mean(multi_labels[node]) for node in range(len(multi_labels))]
G1=nx.from_numpy_matrix(weights_wt_students)
plt.figure(1,figsize=(16,10))
pos1 = nx.spring_layout(G1)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G1, pos1, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G1, pos1) # plot the labels
nx.draw_networkx_edges(G1, pos1) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with GMM (soft clustering)')
plt.show()
In [22]:
np.fill_diagonal(weights_wt_section,0)
G2s=graphs.Graph(weights_wt_section)
G2s.set_coordinates('spring')
G2s.plot()
In [23]:
G2s.compute_laplacian("normalized") # normalized Laplacian, stored in G2s.L
laplacian_2=G2s.L
G2s.compute_fourier_basis(recompute=True)
plt.plot(G2s.e, 'o')
plt.title('Eigenvalues (normalized Laplacian)')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')
eig_2=G2s.e
eig_2[0:30]
Out[23]:
In [24]:
np.diff(eig_2[0:30])
Out[24]:
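A quick programmatic check of where the largest non-trivial gap falls (a sketch using the differences just computed):

gaps = np.diff(eig_2[0:30])
print(np.argmax(gaps[1:]) + 2) # should print 7 if the main gap is indeed after the 7th eigenvalue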
All eigenvalues lie below 2, as expected for a normalized Laplacian.
The number k for K-means should be chosen so that there is a gap in the Laplacian spectrum after the k-th eigenvalue. Here the main gap comes after the 7th eigenvalue (0.55198999), so we will try k = 7 for K-means.
But before that, let's compute the matrix H of the first k eigenvectors of L_norm.
In [25]:
k=7 # 7 for the STI-only graph (12 when using all sections)
# Keep the first k eigenvectors:
H2=G2s.U[:,:k]
D_minus_half_2=np.diag(np.sum(weights_wt_section,1)**(-1/2)) # weighted degrees
F2=D_minus_half_2@H2
In [26]:
# Apply K-means and do the predictions:
kmeans_2 = KMeans(n_clusters=k, random_state=0).fit(F2)
labels_2 = kmeans_2.labels_
In [27]:
G2s.plot_signal(labels_2)
In [28]:
# Display the obtained clusters:
for i in range(k):
    print('\n')
    ind2=np.where(labels_2==i)[0]
    print(Courses[ind2])
    print([courses2[courses2.index.str.endswith(Courses[ind2][p])].CourseTitleFR.tolist()[0] for p in range(len(ind2))])
    print('\n')
In [29]:
# Normalize the weight matrices:
weight_matrices = [weights_wt_students, weights_wrt_assistants, weights_wrt_prof, weights_wt_section, weight_course_same_requirements, weight_course_to_requirement, weight_requirements_same_course]
for i in range(len(weight_matrices)):
    # Set the diagonal of the matrix to 0 (no self-loops)
    np.fill_diagonal(weight_matrices[i], 0)
    weight_matrices[i] = weight_matrices[i]/np.max(weight_matrices[i].ravel()) # scale to [0, 1] so the mixing weights below are comparable
In [30]:
# Weighted sum of the different weight matrices:
weight_different_graph = [0.2,0,0,0,0,0,1] # only the students graph (0.2) and the requirements graph (1.0) contribute
W_all = weight_different_graph[0]*weight_matrices[0]
for i in range(1, len(weight_matrices)):
    W_all = W_all + weight_different_graph[i]*weight_matrices[i]
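Equivalently, the weighted combination can be written in one line (a sketch; the hypothetical `W_all_alt` equals `W_all`):

W_all_alt = sum(w * M for w, M in zip(weight_different_graph, weight_matrices))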
In [31]:
G_alls=graphs.Graph(W_all)
G_alls.set_coordinates('spring')
G_alls.plot()
In [32]:
G_alls.compute_laplacian("normalized") # normalized Laplacian, stored in G_alls.L
laplacian_all=G_alls.L
G_alls.compute_fourier_basis(recompute=True)
plt.plot(G_alls.e, 'o')
plt.title('Eigenvalues (normalized Laplacian)')
plt.xlabel('Eigenvalue index')
plt.ylabel('Eigenvalue')
eig_all=G_alls.e
eig_all[0:30]
Out[32]:
In [33]:
np.diff(eig_all[0:30])
Out[33]:
All eigenvalues lie below 2, as expected for a normalized Laplacian.
As for the students graph, the largest gap comes right after the first eigenvalue, which would suggest k = 1 cluster. Since a single cluster is not useful and no other gap stands out clearly, we again use the silhouette below to pick k.
In [34]:
# Silhouette (same centroid-based score as above):
S=[]
kmax=20
for k in range(2,kmax):
    H=G_alls.U[:,:k]
    D_minus_half=np.diag(np.sum(W_all,1)**(-1/2)) # weighted degrees
    F=D_minus_half@H
    kmeans = KMeans(n_clusters=k, random_state=0).fit(F)
    s=0
    labels = kmeans.labels_
    centroids=kmeans.cluster_centers_
    r=np.arange(k)
    for course in range(len(Courses)):
        dist=scipy.spatial.distance.cdist(centroids,[F[course,:]], 'euclidean')
        a=dist[labels[course]][0]         # distance to the course's own centroid
        b=min(dist[r!=labels[course]])[0] # distance to the nearest other centroid
        s+=(b-a)/max(a,b)
    s/=len(Courses)
    S.append(s)
plt.plot(range(2,kmax),S)
plt.title('Silhouette')
plt.xlabel('k')
plt.ylabel('S')
plt.savefig('Graphs/Graph Screenshots/kmeans_silhouette_merged_graph.png', format='png', dpi=100)
k=np.argmax(S)+2
print(k)
In [35]:
# Keep the first k eigenvectors:
H_all_K=G_alls.U[:,:k]
D_minus_half_all_K=np.diag(np.sum(W_all,1)**(-1/2)) # weighted degrees
F_all_K=D_minus_half_all_K@H_all_K
In [36]:
# Apply K-means :
kmeans_all = KMeans(n_clusters=k, random_state=0).fit(F_all_K)
labels_all = kmeans_all.labels_
In [37]:
G_alls.plot_signal(labels_all)
In [38]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind_all=np.where(labels_all==i)[0]
    print(Courses[ind_all])
    print([courses2[courses2.index.str.endswith(Courses[ind_all][p])].CourseTitleFR.tolist()[0] for p in range(len(ind_all))])
    print('\n')
In [39]:
# Display the graph colored by cluster assignments:
values=[100*labels_all[node] for node in range(len(labels_all))]
G_all=nx.from_numpy_matrix(W_all)
plt.figure(1,figsize=(16,10))
pos_all = nx.spring_layout(G_all)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G_all, pos_all, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G_all, pos_all) # plot the labels
nx.draw_networkx_edges(G_all, pos_all) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with Kmeans')
plt.savefig('Graphs/Graph Screenshots/kmeans_clustering_merged_graph.png', format='png', dpi=100)
plt.show()
As before, we also try a GMM instead of K-means on the merged graph, to allow elliptical cluster shapes and soft assignments.
In [40]:
# To determine the number of Gaussians, we compute the AIC and the BIC:
AIC=[]
BIC=[]
for k in range(1,40):
    H=G_alls.U[:,:k]
    D_minus_half=np.diag(np.sum(W_all,1)**(-1/2)) # weighted degrees
    F=D_minus_half@H
    g = mixture.GaussianMixture(n_components=k)
    g.fit(F)
    labels=g.predict(F)
    AIC.append(g.aic(F))
    BIC.append(g.bic(F))
plt.plot(range(1,40),AIC,'r')
plt.title('AIC')
plt.xlabel('k')
plt.ylabel('AIC')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_AIC_merged_graph.png', format='png', dpi=100)
plt.show()
plt.plot(range(1,40),BIC,'b')
plt.title('BIC')
plt.xlabel('k')
plt.ylabel('BIC')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_BIC_merged_graph.png', format='png', dpi=100)
plt.show()
In [41]:
k=np.argmin(BIC)+1
print(k)
# Keep the first k eigenvectors:
H_all_gmm_h=G_alls.U[:,:k]
D_minus_half_all_gmm_h=np.diag(np.sum(W_all,1)**(-1/2)) # weighted degrees
F_all_gmm_h=D_minus_half_all_gmm_h@H_all_gmm_h
# Apply GMM
g = mixture.GaussianMixture(n_components=k)
g.fit(F_all_gmm_h)
labels_allc=g.predict(F_all_gmm_h)
In [42]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind_all=np.where(labels_allc==i)[0]
    print(Courses[ind_all])
    print([courses2[courses2.index.str.endswith(Courses[ind_all][p])].CourseTitleFR.tolist()[0] for p in range(len(ind_all))])
    print('\n')
In [43]:
# Display the graph colored by cluster assignments:
values=[100*labels_allc[node] for node in range(len(labels_allc))]
G_all=nx.from_numpy_matrix(W_all)
plt.figure(1,figsize=(16,10))
pos_all = nx.spring_layout(G_all)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G_all, pos_all, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G_all, pos_all) # plot the labels
nx.draw_networkx_edges(G_all, pos_all) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with GMM (hard clustering)')
plt.savefig('Graphs/Graph Screenshots/GMM_hard_clustering_merged_graph.png', format='png', dpi=100)
plt.show()
In [44]:
k=np.argmin(BIC)+1
print(k)
# Keep the first k eigenvectors:
H_all_gmm_s=G_alls.U[:,:k]
D_minus_half_all_gmm_s=np.diag(np.sum(W_all,1)**(-1/2)) # weighted degrees
F_all_gmm_s=D_minus_half_all_gmm_s@H_all_gmm_s
# Apply GMM:
g = mixture.GaussianMixture(n_components=k)
g.fit(F_all_gmm_s)
labels_allb=g.predict_proba(F_all_gmm_s)
In [45]:
r=np.arange(k)
multi_labels_all=[list(r[labels_allb[p]>0.1]) for p in range(len(labels_allb))] # assignments with probability > 0.1
labels_matrix_all=[]
for p in range(k):
    lab=[]
    for q in range(len(multi_labels_all)):
        if p in multi_labels_all[q]:
            lab.append(q)
    labels_matrix_all.append(lab)
In [46]:
print(multi_labels_all)
In [47]:
# Display all the obtained clusters:
for i in range(k):
    print('\n')
    ind_all=labels_matrix_all[i]
    print(Courses[ind_all])
    print([courses2[courses2.index.str.endswith(Courses[ind_all][p])].CourseTitleFR.tolist()[0] for p in range(len(ind_all))])
    print('\n')
In [48]:
# Display the graph colored by cluster assignments:
values=[100*np.mean(multi_labels_all[node]) for node in range(len(multi_labels_all))]
G_all=nx.from_numpy_matrix(W_all)
plt.figure(1,figsize=(16,10))
pos_all = nx.spring_layout(G_all)
cmap=plt.get_cmap('jet')
nx.draw_networkx_nodes(G_all, pos_all, cmap=cmap, node_color = values) # plot the nodes
nx.draw_networkx_labels(G_all, pos_all) # plot the labels
nx.draw_networkx_edges(G_all, pos_all) # plot the edges
sm = plt.cm.ScalarMappable(cmap=cmap, norm=matplotlib.colors.Normalize(vmin=0, vmax=(k-1)))
sm._A = []
plt.colorbar(sm)
plt.title('Graph clustered with GMM (soft clustering)')
plt.show()
In [49]:
def suggest_wrt1(course):
    """Suggest courses similar to `course` (a course code, or a list of codes),
    using the clusterings of the students graph."""
    # Deal with the case where course is a single string and not a list of strings:
    if type(course)==str:
        course=[course]
    print('Results with K-means:\n')
    L=[]
    indices1=[]
    for q in range(len(course)):
        L.append(labels_1[courses_index_dico[course[q]]]) # list the labels of the chosen courses
        indices1.append(courses_index_dico[course[q]])    # indices of the chosen courses
    L=Counter(L).most_common(1)[0][0] # keep the label L of the most common cluster
    ind=np.where(labels_1==L)[0] # get the indices of the courses of the cluster
    indices1=np.array(indices1)
    indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the most common cluster
    # Compute the distances between the courses of the cluster and the chosen courses:
    dist=scipy.spatial.distance.cdist(F1_K[ind,:],F1_K[indices,:], 'euclidean')
    dist=np.mean(dist, axis=1)
    # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
    ind=ind[np.argsort(dist)]
    print(Courses[ind])
    print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
    print('\n')
    print('\nResults with GMM soft-clustering:\n')
    if len(course)==1:
        L=multi_labels[courses_index_dico[course[0]]] # keep the labels L of the clusters
    else:
        L=[item for q in range(len(course)) for item in multi_labels[courses_index_dico[course[q]]]]
        L=[Counter(L).most_common(1)[0][0]] # keep the label L of the most common cluster
    for p in range(len(L)):
        ind=np.array(labels_matrix[L[p]]) # get the indices of the courses of the cluster
        indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the cluster
        # Compute the distances between the courses of the cluster and the chosen courses:
        dist=scipy.spatial.distance.cdist(F1_gmm_s[ind,:],F1_gmm_s[indices,:], 'euclidean')
        dist=np.mean(dist, axis=1)
        # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
        ind=ind[np.argsort(dist)]
        print(Courses[ind])
        print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
        print('\n')
    print('\nResults with GMM hard-clustering:\n')
    L=[]
    for q in range(len(course)):
        L.append(labels_1c[courses_index_dico[course[q]]])
    L=Counter(L).most_common(1)[0][0] # keep the label L of the most common cluster
    ind=np.where(labels_1c==L)[0] # get the indices of the courses of the cluster
    indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the most common cluster
    # Compute the distances between the courses of the cluster and the chosen courses:
    dist=scipy.spatial.distance.cdist(F1_gmm_h[ind,:],F1_gmm_h[indices,:], 'euclidean')
    dist=np.mean(dist, axis=1)
    # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
    ind=ind[np.argsort(dist)]
    print(Courses[ind])
    print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
In [50]:
suggest_wrt1('EE-554')
In [51]:
c=['EE-535', 'EE-420']
print([courses2[courses2.index.str.endswith(c[p])].CourseTitleFR.tolist()[0] for p in range(len(c))])
print('\n')
suggest_wrt1(c)
In [52]:
def suggest_wrt2(course):
    """Suggest courses similar to `course` (a course code, or a list of codes),
    using the clusterings of the merged graph."""
    # Deal with the case where course is a single string and not a list of strings:
    if type(course)==str:
        course=[course]
    print('Results with K-means:\n')
    L=[]
    indices1=[]
    for q in range(len(course)):
        L.append(labels_all[courses_index_dico[course[q]]]) # list the labels of the chosen courses
        indices1.append(courses_index_dico[course[q]])      # indices of the chosen courses
    L=Counter(L).most_common(1)[0][0] # keep the label L of the most common cluster
    ind=np.where(labels_all==L)[0] # get the indices of the courses of the cluster
    indices1=np.array(indices1)
    indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the most common cluster
    # Compute the distances between the courses of the cluster and the chosen courses:
    dist=scipy.spatial.distance.cdist(F_all_K[ind,:],F_all_K[indices,:], 'euclidean')
    dist=np.mean(dist, axis=1)
    # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
    ind=ind[np.argsort(dist)]
    print(Courses[ind])
    print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
    print('\n')
    print('\nResults with GMM soft-clustering:\n')
    if len(course)==1:
        L=multi_labels_all[courses_index_dico[course[0]]] # keep the labels L of the clusters
    else:
        L=[item for q in range(len(course)) for item in multi_labels_all[courses_index_dico[course[q]]]]
        L=[Counter(L).most_common(1)[0][0]] # keep the label L of the most common cluster
    for p in range(len(L)):
        ind=np.array(labels_matrix_all[L[p]]) # get the indices of the courses of the cluster
        indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the cluster
        # Compute the distances between the courses of the cluster and the chosen courses:
        dist=scipy.spatial.distance.cdist(F_all_gmm_s[ind,:],F_all_gmm_s[indices,:], 'euclidean')
        dist=np.mean(dist, axis=1)
        # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
        ind=ind[np.argsort(dist)]
        print(Courses[ind])
        print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
        print('\n')
    print('\nResults with GMM hard-clustering:\n')
    L=[]
    for q in range(len(course)):
        L.append(labels_allc[courses_index_dico[course[q]]])
    L=Counter(L).most_common(1)[0][0] # keep the label L of the most common cluster
    ind=np.where(labels_allc==L)[0] # get the indices of the courses of the cluster
    indices=list(indices1[list(np.isin(indices1, ind))]) # indices of the chosen courses that are in the most common cluster
    # Compute the distances between the courses of the cluster and the chosen courses:
    dist=scipy.spatial.distance.cdist(F_all_gmm_h[ind,:],F_all_gmm_h[indices,:], 'euclidean')
    dist=np.mean(dist, axis=1)
    # Output the courses of the cluster, ordered by increasing distance to the chosen courses:
    ind=ind[np.argsort(dist)]
    print(Courses[ind])
    print([courses2[courses2.index.str.endswith(Courses[ind][p])].CourseTitleFR.tolist()[0] for p in range(len(ind))])
In [53]:
suggest_wrt2('EE-554')
In [54]:
c=['EE-535', 'EE-420']
print([courses2[courses2.index.str.endswith(c[p])].CourseTitleFR.tolist()[0] for p in range(len(c))])
print('\n')
suggest_wrt2(c)