In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import scipy
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
import pickle
import os
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
%load_ext autoreload
%autoreload 2
The raw files had encoding issues and could not be read reliably across operating systems, so we reformat them here. The cleaned files are saved with the suffix _new.
In [2]:
# replace unnecessary double quotes in the files
for file in ["PV_Course_Enrolments", "PV_Course_Descriptions"]:
    with open('../data/{}.csv'.format(file), 'r', encoding="utf8") as courses_old:
        with open('../data/{}_new.csv'.format(file), 'w', encoding="utf8") as courses_new:
            lines = courses_old.readlines()
            for line in lines:
                # skip stray lines shorter than 3 characters
                if len(line) > 2:
                    line = re.sub(r'(?<!,)"(?!,)', "'", line)
                    courses_new.write(line)
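A quick illustration on a made-up line: quotes adjacent to a comma are treated as field delimiters and kept, while embedded quotes become apostrophes.
sample = 'COM-300,"Intro to "data" science",2017\n'
print(re.sub(r'(?<!,)"(?!,)', "'", sample))
# COM-300,"Intro to 'data' science",2017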
This portion handles the course descriptions as provided on the EPFL website. The dataset contains multiple attributes characterizing each course. Its main issue is that the data is not standardized, so significant cleaning is required, especially for the textual fields.
In [3]:
courses=pd.read_csv("../data/PV_Course_Descriptions_new.csv",quotechar='"', iterator=False, encoding="utf8",
error_bad_lines=False)
courses.columns
Out[3]:
In [4]:
courses.drop( ['ExpectedWork_FR', 'ExpectedWork_EN',
'Other_FR', 'Other_EN',
'ProjectManagement', 'WorkInSociety', 'PersonalEffectiveness',
'TrainingAchievements', 'WorkInGroupsAndOrgs',
'CommunicateInformation','Handouts_FR', 'Handouts_EN',
'Bibliography_FR', 'Bibliography_EN','TeachingMethod_FR', 'TeachingMethod_EN',
'EvaluationMethod_FR','EvaluationMethod_EN' ], axis = 1, inplace = True)
print(courses.columns)
courses.set_index("CourseCode", inplace = True)
In [5]:
# remove incorrect <p> HTML tags and keep the first clean paragraph
def findCorrectBalise(string):
    if type(string) == type(''):
        string = re.sub(r'!-*\[endif\] ?-->(?!<//?p)', '!--[endif] --></p>\r\n<p>', string)
        begIndex = list(re.finditer('<p>', string))
        endIndex = list(re.finditer('</p>', string))
        for openP, closeP in zip(begIndex, endIndex):
            paragraph = string[openP.start():closeP.end()]
            # skip paragraphs that are MS-Word styling artefacts or empty
            if 'mso' not in paragraph and 'padding-top' not in paragraph and '>&#160;<' not in paragraph:
                return paragraph
    return string
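A sketch of the behaviour on a hypothetical value: the first <p>...</p> pair whose content carries MS-Word styling residue is skipped, and the first clean paragraph is returned.
html = '<p>MsoNormal mso-style junk</p><p>Introduction au machine learning</p>'
print(findCorrectBalise(html))
# <p>Introduction au machine learning</p>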
In [6]:
# replace HTML escape codes with the corresponding unicode character
def htmlToStr(match):
    match = match.group()
    code = int(re.findall(r'(?<=\&#)\d+(?=;)', match)[0])
    return chr(code)

mapToChange = {'<squote/>': "'", '<comma/>': ",", '<p>': '', '&#39;': "'", ' & ': ' et ', '<->': '-',
               '</p>': '', '<br />': '\r\n', '<ul>': '', '</li>': '', '</ul>': '', '<li>': '',
               '<ol>': '', '</ol>': '', '<i>': '', '</i>': '', '"': "'", '</em>': '', '<em>': '',
               '<b>': '', '</b>': '', '<strong>': '', '</strong>': '', '&amp;': '&',
               '<a.*>': '', '</a>': '', '<!-+.*-+>': '', '<span.+>': '', '&lt;': '<', '&gt;': '>'}

for col in courses.columns:
    courses[col] = courses[col].apply(findCorrectBalise)
    for pattern in mapToChange.keys():
        courses[col] = courses[col].str.replace(pattern, mapToChange[pattern], case=True)
    courses[col] = courses[col].str.replace(r'&#[0-9]{3,5};', htmlToStr, case=True)
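For instance, on a hypothetical cell value, the numeric HTML entities decode to their unicode characters:
print(re.sub(r'&#[0-9]{3,5};', htmlToStr, 'Pr&#233;requis : alg&#232;bre lin&#233;aire'))
# Prérequis : algèbre linéaire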
In [7]:
print("There are {} courses".format(len(courses)))
courses.head()
Out[7]:
Removing Social Science Courses
In [8]:
courses = courses[~courses.index.str.contains("HUM")]
#ARcourse = courses.index[courses.StudyPlansFR.str.contains('AR')].tolist()
#SIEcourse = courses.index[courses.StudyPlansFR.str.contains('ENV')].tolist()
#GCcourse = courses.index[courses.StudyPlansFR.str.contains('^ ?GC| ?; ?GC')].tolist()
#ARcourse=list(set(ARcourse)-set(SIEcourse)-set(GCcourse))
#courses = courses[~courses.index.isin(ARcourse)]
#courses = courses[~courses.index.str.contains("AR|HUM|BIO|CH|ChE|CIVIL|ENV|ETH|HEP|MATH|PHYS")]
print("There are {} courses".format(len(courses)))
courses.head()
Out[8]:
Removing First Year Courses
In [9]:
for idx in courses.index:
    if idx == 'BIO-105' or idx == 'PHYS-114':
        continue
    m = re.search(r'\d', idx)
    if idx[m.start()] == "1":
        courses.drop(idx, inplace=True)
print("There are {} courses".format(len(courses)))
In [10]:
print(len(courses))
for col in courses.columns.tolist():
    print(col, len(courses[col].unique()))
In [11]:
def remove_numbers(string):
    if string is not np.nan:
        return re.sub(r'[0-9]+', '', string)
    else:
        return string

def remove_all_after_word(word, string, included=False):
    if string:
        string = re.sub(r'[0-9]+', '', string)
        r = re.compile(r'(.*?)(?:{}.*?$|\Z)'.format(word),
                       re.MULTILINE | re.DOTALL)
        if word in string:
            if included:
                return ''.join(r.findall(string))
            else:
                return ''.join(r.findall(string)) + word
        else:
            return string
    return string
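Two hypothetical calls illustrating the flag: with included=True the keyword itself is dropped, with included=False it becomes the truncation point and is kept.
remove_all_after_word("Bachelor", "Génie civil Bachelor semestre 5", included=True)   # -> 'Génie civil '
remove_all_after_word("GC", "GC Génie civil", included=False)                         # -> 'GC'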
In [12]:
courses = courses.join(courses.StudyPlansFR.str.split(";",expand=True))
print(len(courses["StudyPlansFR"].unique()))
Looking at the columns to see how many were created by the split.
In [13]:
courses.columns
Out[13]:
In [14]:
for study_plan in range(16):
    for word in ["HES", "Bachelor", "Echange", "Master"]:
        courses[study_plan] = courses[study_plan].apply(lambda x: remove_all_after_word(word, x, included=True))
    for word in ["UNIL", "STAS", "CGC", "AR", "GC", "GM", "IN", "IF", "MA", "SV",
                 "SIE", "SHS", "PH", "HD", "MES", "MTEE", "MT ", "SC", "SIE", "MX"]:
        courses[study_plan] = courses[study_plan].apply(lambda x: remove_all_after_word(word, x, included=False))
    # each replacement must go through the .str accessor to act on substrings
    courses[study_plan] = (courses[study_plan].str.replace(' - ', ' ').str.replace('- ', '')
                           .str.replace(' -', '').str.replace('-', ''))

study_plans = []
for i in range(16):
    study_plans += list(courses[i].unique())
study_plans = sorted(set(filter(None, study_plans)))
print(len(study_plans))
study_plans
Out[14]:
In [15]:
courses["StudyPlansEN"]= courses["StudyPlansEN"].apply(lambda x:remove_numbers(x))
cols = [i for i in range(16)]+["StudyPlansEN"]
courses["StudyPlans"] = courses[cols].apply(lambda x: '; '.join(sorted(list(set(x.dropna().astype(str))))),axis=1)
cols +=["StudyPlansFR"]
courses.drop(cols, axis = 1, inplace = True)
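On a made-up row, the aggregation deduplicates and sorts the study plans before joining them:
row = pd.Series(['GM', 'EL', None, 'GM'])
print('; '.join(sorted(set(row.dropna().astype(str)))))
# EL; GM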
In [16]:
courses["StudyPlans"].unique()
Out[16]:
The number of unique values is high because every possible combination of study plans appears, and the combinations were not sorted in the raw data.
In [17]:
courses.columns
Out[17]:
In [18]:
courses.head()
Out[18]:
In [19]:
print("Mandatory pre-requirements EN : ",len(courses.MandatoryPrerequirements_EN.unique()))
print("Mandatory pre-requirements FR : ",len(courses.MandatoryPrerequirements_FR.unique()))
print("Indicative pre-requirements EN : ",len(courses.IndicativePrerequirements_EN.unique()))
print("Indicative pre-requirements FR : ",len(courses.IndicativePrerequirements_FR.unique()))
print("Prepares for EN : ",len(courses.PreparesFor_EN.unique()))
print("Prepares for FR : ",len(courses.PreparesFor_FR.unique()))
In [20]:
# Helper functions to extract course codes from strings
def format_course_codes(x):
    # Remove extra parentheses
    x = x[1:-1] if (x[0] == "(" and x[-1] == ")") else x
    x = x[1:] if x[0] == "(" else x
    x = x[:-1] if not "(" in x and x[-1] == ")" else x
    # Replace the space with a dash
    x = x.replace(" ", "-")
    # If there is no dash in the course code, insert one before the first number
    m = re.search(r'-', x)
    if not m:
        m = re.search(r'\d', x)
        x = x[0:m.start()] + "-" + x[m.start():]
    # If the last element in the string is a letter, add parentheses around it
    if not str.isdigit(x[-1]) and not x[-1] == ")":
        x = x[0:-1] + "(" + x[-1] + ")"
    return x

def identify_course_code_in_string(string):
    # if the string is valid
    if type(string) == str:
        # Use the regex to find the course code pattern
        pattern = re.compile(r'\(?[A-Z]{2,6}-? ?[0-9]{3}\(?[A-Za-z]?\)?\)?')
        found_patterns = pattern.findall(string)
        # If patterns were found, normalize and return them
        if found_patterns:
            found_patterns = [format_course_codes(x) for x in found_patterns]
            return found_patterns
        else:
            return []

def identify_course_name_in_string(string):
    # if the string is valid
    if type(string) == str:
        # Strings which do not contain any recommendation to begin with (all outputs were verified manually)
        m = re.search(r'none|no prerequisits|No recommended courses', string)
        if m or (len(string) < 5) or ((("no " in string[0:3]) or ("non " in string[0:4])) and (len(string) < 10)) \
                or ("Rien" in string[0:5]) or ("cycle bachelor" in string) \
                or ("Aucun" in string[0:10]) or ("aucun" in string[0:10]) or ("Néant" in string[0:10]) \
                or ((("Bachelor" in string[0:10]) or ("bachelor" in string[0:10]) or ("Master" in string[0:10]) or ("MASTER" in string[0:10])) and len(string) < 30) \
                or "master in" in string or "MTE diploma work" in string or "Term papers" in string \
                or "Bachelor in Life Science" in string \
                or (("N/A" in string) and (len(string) < 5)) \
                or ((("Aucun" in string) or ('aucun' in string)) and (len(string) < 7)):
            return []
        # String matching against the known course titles
        else:
            match = []
            for column in ["CourseTitleFR", "CourseTitleEN"]:
                for idx, title in enumerate(courses[column].tolist()):
                    if type(title) == str:
                        if title.lower() in string.lower():
                            match.append(courses.index.tolist()[idx])
            if len(match):
                return match
            else:
                return string
    else:
        return string

def extract_course_codes(string, do_prints=False):
    if type(string) == str:
        course_code = identify_course_code_in_string(string)
        if len(course_code):
            if do_prints: print("-------------------\n", course_code, string, "\n-------------------")
            return course_code
        else:
            # No explicit code found: try to match against the course titles
            course_name = identify_course_name_in_string(string)
            # A list covers both title matches and explicit "no prerequisite" statements
            if type(course_name) is list:
                if do_prints: print("-------------------\n", course_name, string, "-------------------\n")
                return course_name
            else:
                if do_prints: print("-------------------\n", string)
                return string
    else:
        return []
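A few hypothetical inputs run through the extractor: explicit codes win, free-text codes are normalized, and an explicit "no prerequisite" statement yields an empty list.
print(extract_course_codes("Analyse IV (MATH-207) ou équivalent"))  # ['MATH-207']
print(extract_course_codes("Linear algebra, e.g. MATH 111"))        # ['MATH-111']
print(extract_course_codes("Aucun"))                                # []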
In [21]:
# Extracting course codes in the different fields
courses2 = courses.copy()
requirements_columns = ["MandatoryPrerequirements_EN", "MandatoryPrerequirements_FR", "IndicativePrerequirements_EN",
                        "IndicativePrerequirements_FR", "PreparesFor_EN", "PreparesFor_FR"]
for column in requirements_columns:
    courses2[column] = courses2[column].apply(lambda x: extract_course_codes(x))
    num_matched = 0
    no_req = 0
    for idx in range(len(courses2)):
        if type(courses2[column].iloc[idx]) is list:
            if len(courses2[column].iloc[idx]):
                num_matched = num_matched + 1
            else:
                no_req = no_req + 1
    print("--------------------------------------")
    print(column, len(courses2))
    print("{:.2f}% matched ".format(num_matched / len(courses2) * 100))
    print("{:.2f}% without pre-requirements".format(no_req / len(courses2) * 100))
    print("{:.2f}% to be matched".format(100 - (num_matched + no_req) / len(courses2) * 100))
    print("--------------------------------------")
print(len(courses2))
In [22]:
courses2["Requirements"] = " "
reqs = 0
for course in courses2.index.tolist():
requirements = []
for column in requirements_columns:
column_requirements = courses2.loc[course, column]
if type(column_requirements) is list:
requirements.append(column_requirements)
requirements = list(set(sum(requirements, [])))
if len(requirements):
#print(requirements)
reqs = reqs +1
#else :
#print(requirements)
courses2.loc[course, "Requirements"] = requirements
print("Percentage of courses with requirements {:.2f}% of {}".format(reqs/len(courses2)*100, len(courses2)))
In [23]:
courses2.head()
Out[23]:
In [24]:
courses2.drop(requirements_columns, axis = 1, inplace = True)
courses = courses2.copy()
del courses2
courses.to_pickle("../data/cleaned_courses.pickle")
In [25]:
courses.head()
Out[25]:
For each course we will create a string of lemmatized keywords without stopwords.
In [26]:
courses = pd.read_pickle("../data/cleaned_courses.pickle")
courses.head()
Out[26]:
In [27]:
# Splitting keywords helper function
def split_keywords(x):
    if type(x) is str:
        x = x.lower()
        for symbol in [";", ",", ".", "\n", " - ", "- ", "_"]:
            if type(x) is not str:
                temp = list()
                for word in x:
                    temp += word.split(symbol)
                x = temp
            else:
                x = x.split(symbol)
        return list(filter(None, x))
    else:
        return

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Lemmatization functions
punctuation = string.punctuation + '“’—.”’“--,”'  # extend the list of punctuation to remove

# Taken from the ADA tutorial on text handling
def rem_stop(txt, stop_words=stopwords.words("english"), lower=True, punct=True):
    """
    Removes stopwords, punctuation and other things from a text, incl. numbers
    :param list txt: text tokens (list of str)
    :param list stop_words: stopwords to remove (list of str)
    :param bool lower: whether to lowercase
    :param bool punct: whether to remove punctuation
    """
    if type(txt) is list:
        if lower and punct:
            return [t.lower() for t in txt if t.lower() not in stop_words and t.lower() not in punctuation and not t.isdigit()]
        elif lower:
            return [t.lower() for t in txt if t.lower() not in stop_words and not t.isdigit()]
        elif punct:
            return [t for t in txt if t.lower() not in stop_words and t.lower() not in punctuation and not t.isdigit()]
        return [t for t in txt if t.lower() not in stop_words and not t.isdigit()]
    else:
        return

def get_wordnet_pos(treebank_tag):
    """
    Cf. https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    :param treebank_tag: a tag from nltk.pos_tag treebank
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatization(list_of_words, language):
    if type(list_of_words) is list:
        # Remove stop words
        clean_words = rem_stop(list_of_words, stop_words=stopwords.words(language))
        # Get a list of token lists
        clean_words = [x.split() for x in clean_words]
        out = list()
        for words in clean_words:
            interm_list = list()
            for word in words:
                # Get the part-of-speech tag of the word
                pos = nltk.pos_tag([word])
                interm_list.append(lemma.lemmatize(word, get_wordnet_pos(pos[0][1])))
            out.append(" ".join(interm_list))
        return out
    else:
        return []
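A quick hypothetical run of the two helpers chained, as done in the next cell (the exact lemmas depend on the POS tags NLTK assigns to isolated words):
kw = split_keywords("Machine learning; Neural networks, Optimization")
print(kw)
# ['machine learning', ' neural networks', ' optimization']
print(lemmatization(kw, "english"))
# e.g. ['machine learn', 'neural network', 'optimization']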
Formatting the different columns
In [28]:
for columns, language in zip(["KeyWords_EN", "Summary_EN", "ImportantConcepts_EN", "Content_EN",
                              "KeyWords_FR", "Summary_FR", "ImportantConcepts_FR", "Content_FR"],
                             ["english", "english", "english", "english",
                              "french", "french", "french", "french"]):
    courses[columns] = courses[columns].apply(lambda x: split_keywords(x))
    courses[columns] = courses[columns].apply(lambda x: lemmatization(x, language))
Merging the summaries and important concepts in each of the languages
In [29]:
courses["Summary_Concepts_Contents_EN"] = " "
courses["Summary_Concepts_Contents_FR"] = " "
concepts_cols = ["ImportantConcepts_EN", "ImportantConcepts_FR"]
summary_cols = ["Summary_EN","Summary_FR"]
content_cols = ["Content_EN", "Content_FR"]
final_cols = ["Summary_Concepts_Contents_EN", "Summary_Concepts_Contents_FR"]
for course in courses.index.tolist():
for concepts_col, summary_col, content_col, merged_col in zip(concepts_cols,summary_cols, content_cols, final_cols):
summary = courses.loc[course, summary_col]
concepts = courses.loc[course,concepts_col]
if summary and concepts:
courses.loc[course, merged_col] = list(set(concepts + summary))
elif summary :
courses.loc[course, merged_col] = list(set(summary))
elif concepts:
courses.loc[course, merged_col] = list(set(concepts))
courses.drop(concepts_cols + summary_cols + content_cols, axis = 1, inplace = True)
courses.head()
Out[29]:
In [30]:
enrol=pd.read_csv("../data/PV_Course_Enrolments_new.csv",quotechar='"', iterator=False, encoding="utf8", error_bad_lines=False)
enrol.columns.values.tolist()
Out[30]:
In [31]:
enrol.drop(['CourseEnrolmentID','Session','LevelName','TeachingLanguage','StudyDomain', 'StudyPlanName'],axis=1,inplace=True)
In [32]:
for year in enrol['Year'].values.tolist():
    years = year.split('-')
    if len(years[0]) != 4 or len(years[1]) != 4:
        print(years)
In [33]:
enrol['Year'].unique()
Out[33]:
In [34]:
print(enrol['SectionCode'].unique().tolist())
Removing all non-essential suffixes and prefixes
In [35]:
enrol['SectionCode']= enrol.SectionCode.str.replace('ENS_|ING_|ECH_|_HES|_ECH|_NE|_B|_STV|_ING|_CHIM|_CO|_EPFL|_EURECOM|_HEP', '')
enrol['SectionCode']= enrol.SectionCode.str.replace('MTEE', 'MTE')
print(enrol['SectionCode'].unique().tolist())
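For instance, on hypothetical codes (regex=True is passed explicitly here; it was the default in the pandas version used above):
demo = pd.Series(['ENS_GM', 'IN_ECH', 'MTEE'])
demo = demo.str.replace('ENS_|ING_|ECH_|_HES|_ECH|_NE|_B|_STV|_ING|_CHIM|_CO|_EPFL|_EURECOM|_HEP', '', regex=True)
print(demo.str.replace('MTEE', 'MTE').tolist())
# ['GM', 'IN', 'MTE']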
In [36]:
enrol['StudyPlanCode'].value_counts()
Out[36]:
In [37]:
enrol['StudyPlanCode']= enrol.StudyPlanCode.str.replace('BA5B?', 'BA5')
enrol['StudyPlanCode']= enrol.StudyPlanCode.str.replace('BA6B?', 'BA6')
enrol['StudyPlanCode']= enrol.StudyPlanCode.str.replace('PDMAUT|PDMPRI', 'PDM')
print(enrol['StudyPlanCode'].unique().tolist())
Handling the SEM_PRINTEMPS and SEM_AUTOMNE semesters by using the course code information to acquire the year.
In [38]:
enrol = enrol.join(enrol.CourseCodes.str.split("-",expand=True))
enrol.drop([2, 3],axis = 1, inplace = True)
enrol.rename(columns={"SectionCode":"StudentSectionCode",0:"CourseSection", 1:"CourseID"}, inplace = True)
enrol.head()
Out[38]:
In [39]:
# Mapping to link the years and semesters to the StudyPlanCode
course_semesters ={("0", "SEM_AUTOMNE"):"PREPA1", ("0", "SEM_PRINTEMPS"):"PREPA2",\
("1", "SEM_AUTOMNE"):"BA1", ("1", "SEM_PRINTEMPS"):"BA2",\
("2", "SEM_AUTOMNE"):"BA3", ("2", "SEM_PRINTEMPS"):"BA4",\
("3", "SEM_AUTOMNE"):"BA5", ("3", "SEM_PRINTEMPS"):"BA6",\
("4", "SEM_AUTOMNE"):"MA1", ("4", "SEM_PRINTEMPS"):"MA2",\
("5", "SEM_AUTOMNE"):"MA3", ("5", "SEM_PRINTEMPS"):"MA4",}
for semester in ["SEM_AUTOMNE", "SEM_PRINTEMPS"]:
    for year in ["0", "1", "2", "3", "4", "5"]:
        idx = (enrol.CourseID.str[0] == year) & (enrol.StudyPlanCode == semester)
        enrol.loc[idx, "StudyPlanCode"] = course_semesters[(year, semester)]
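For example, the first digit of the CourseID gives the study year, so an autumn enrolment in a 400-level course maps to MA1:
print(course_semesters[("4", "SEM_AUTOMNE")], course_semesters[("3", "SEM_PRINTEMPS")])
# MA1 BA6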
In [40]:
print(enrol['StudyPlanCode'].unique().tolist())
Computing the number of years each course was given, to be used as a normalization factor.
In [41]:
years = pd.DataFrame(enrol.groupby("CourseCodes").Year.unique())
years.reset_index(inplace=True)
years.CourseCodes = years.CourseCodes.apply(lambda x: x.split(";")[0])
years.Year = years.Year.apply(lambda x: len(x))
years.set_index("CourseCodes", inplace=True)
years = years.to_dict()['Year']

# Pickle the dictionary for later reuse
with open(os.path.join("Graphs", 'years.pkl'), 'wb') as output:
    pickle.dump(years, output)

plt.hist(list(years.values()))
Out[41]:
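The saved dictionary can later be loaded back for the normalization, a sketch assuming the same Graphs directory:
with open(os.path.join("Graphs", 'years.pkl'), 'rb') as f:
    years_check = pickle.load(f)
assert years_check == years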
In [42]:
print("Before removing duplicates : ", len(enrol))
enrol.drop_duplicates(subset = ["PersonID", "CourseCodes"], keep = "last", inplace = True)
print("After removing duplicates : ", len(enrol))
In [43]:
# Remove the internships
enrol = enrol[~enrol['SubjectName'].str.contains("Stage|stage|Internship|internship")]
print(len(enrol))
# Remove the early bachelor and preparatory semesters
enrol = enrol[~enrol['StudyPlanCode'].str.contains("BA1|BA2|BA3|BA4|PREPA1|PREPA2")]
# Remove the projects
enrol = enrol[~enrol['SubjectName'].str.contains("Projet|projet|Project|project")]
# Keep only the courses that still exist:
valid_courses = courses.index.unique().tolist()
enrol = enrol[enrol['CourseCodes'].isin(valid_courses)]
# Do the same for the courses
courses = courses[courses.index.isin(enrol['CourseCodes'].unique())]
enrol.to_pickle("../data/cleanedAR_enrol_initial.pickle")
courses.to_pickle("../data/cleanedAR_courses_initial.pickle")
print(len(enrol))
# Remove the SHS courses:
enrol = enrol[~enrol['CourseSection'].str.contains("HUM")]
# Remove courses that only architects take
ARcourse = courses.index[courses.StudyPlans.str.contains('AR')].tolist()
SIEcourse = courses.index[courses.StudyPlans.str.contains('SIE')].tolist()
MEScourse = courses.index[courses.StudyPlans.str.contains('MES')].tolist()
GCcourse = courses.index[courses.StudyPlans.str.contains('^ ?GC| ?; ?GC')].tolist()
INcourse = courses.index[courses.StudyPlans.str.contains('^ ?IN| ?; ?IN|^ ?SC| ?; ?SC')].tolist()
SBcourse = courses.index[courses.StudyPlans.str.contains('^ ?SB| ?; ?SB')].tolist()
SVcourse = courses.index[courses.StudyPlans.str.contains('^ ?SV| ?; ?SV')].tolist()
STIcourse = courses.index[courses.StudyPlans.str.contains('^ ?GM| ?; ?GM|^ ?MT| ?; ?MT|^ ?EL| ?; ?EL|^ ?MX| ?; ?MX')].tolist()
# set difference binds tighter than union, so the parentheses make the evaluation explicit
ARcourse = list(set(ARcourse) | (set(SIEcourse) - set(INcourse) - set(SBcourse) - set(STIcourse) - set(SVcourse)))
ARcourse.append('AR-461')
enrol = enrol[~enrol.CourseCodes.isin(ARcourse)]
print(len(enrol), len(courses))
In [44]:
print(len(enrol))
for col in enrol.columns.tolist():
    print(col, len(enrol[col].unique()))
In [45]:
# Keep only the course that still exist:
valid_courses=courses.index.unique().tolist()
enrol=enrol[enrol['CourseCodes'].isin(valid_courses)]
# Do the same for the courses
courses=courses[courses.index.isin(enrol['CourseCodes'].unique())]
print(len(courses))
enrol.to_pickle("../data/cleaned_enrol.pickle")
courses.to_pickle("../data/cleaned_courses.pickle")
In [46]:
enrol = enrol[enrol.StudyPlanCode != 'BA5']
enrol = enrol[enrol.StudyPlanCode != 'BA6']
# Keep only the STI-related course sections
sections = ['ME', 'MICRO', 'EE', 'MSE', 'BIOENG']
enrol = pd.concat([enrol[enrol.CourseSection == s] for s in sections], ignore_index=True)
courses = courses[courses.index.isin(enrol['CourseCodes'].unique())]
print(len(courses))
enrol.to_pickle("../data/cleaned_enrol_STI.pickle")
courses.to_pickle("../data/cleaned_courses_STI.pickle")