In [1]:
# Import libraries
import requests
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import numpy as np
import re
import unicodedata
from sklearn.decomposition import PCA
%matplotlib inline
In part 1 (top universities) we will create functions to compute and plot the different required ratios that we will reuse in part 2.
To scrap the datas (for both website) we used postman and saw there were a json string we could scrap and parse to get the ranking. For www.topuniversities.com we had to scrap the university specific page to get some more complete informations such as the number of students.
The ranking is broken and contains only the top 199 universities even though the last university has rank 200, if we want to have 200 universities we should include the university ranked 201, as we want the top 200, we will take only the 199 first universities.
In [2]:
r = requests.get("https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt")
json_dict = r.json()
json_dict = json_dict['data']
json_dict = json_dict[:199]
for e in json_dict:
#Delete useless informations
del e['nid']
del e['logo']
del e['core_id']
del e['score']
del e['cc']
del e['stars']
del e['guide']
#Get the university specific url
r = requests.get("https://www.topuniversities.com/" + e['url'])
soup = BeautifulSoup(r.text, 'html.parser')
#Get faculty members numbers
scrap_list = soup.find_all('div', class_='faculty-main wrapper col-md-4')
if(len(scrap_list) == 0): #If no informations about the faculty members is given
e['# Faculty members'] = 0
e['# Int. faculty members'] = 0
else:
n_list = scrap_list[0].find_all(class_="number")
if(len(n_list) == 2):
e['# Faculty members'] = int(re.sub(',', '', n_list[0].text.strip()))
e['# Int. faculty members'] = int(re.sub(',', '', n_list[1].text.strip()))
else: #If they don't have international faculty member
e['# Faculty members'] = int(re.sub(',', '', n_list[0].text.strip()))
e['# Int. faculty members'] = 0
#Get students numbers
n_list = soup.find_all('div', class_='students-main wrapper col-md-4')
if(len(n_list) != 0):
e['# Students'] = int(re.sub(',', '', n_list[0].find_all(class_="number")[0].text.strip()))
else:
e['# Students'] = 0
n_list = soup.find_all('div', class_='int-students-main wrapper col-md-4')
if(len(n_list) != 0):
e['# Int. students'] = int(re.sub(',', '', n_list[0].find_all(class_="number")[0].text.strip()))
else:
e['# Int. students'] = 0
#We don't need the url anymore, we can delete it
del e['url']
df1 = pd.DataFrame(json_dict)
df1=df1.rename(columns = {'rank_display':'Rank', 'title' : 'University', 'country' : 'Country', 'region' : 'Region'})
df1["Rank"] = df1["Rank"].apply(lambda x: int(re.sub("[^0-9]", "", x)))
df1.index = df1['University']
del df1['University']
Best university according to ratio between faculty members and students. We assume that a bigger ratio is better. Even though it might not be desirable, we assume it is the case for this exercise. We plot the bar charts only for the first 30 top universites (according to the ratio) to get an idea of the trend.
In [3]:
def uni_ratio_fac_stu(df):
#Compute ratio
df['Ratio Faculty members'] = df['# Faculty members']/df['# Students']
#Sort the rows according to ratio and locate the top 30 ones
df_sorted_ratios = df.sort_values(by=['Ratio Faculty members'], ascending=False).head(30)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
plot = sns.barplot(y=df_sorted_ratios['Ratio Faculty members'], x=df_sorted_ratios.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
print("Best university according to ratio between faculty members and students is : " + df_sorted_ratios.iloc[0].name)
#Delete the ratio column of the dataframe
del df_sorted_ratios
In [4]:
uni_ratio_fac_stu(df1)
As we can see except for the first few universites which clearly have a higher ratio than the others, the ratio tends to decrease in a linear fashion
Let's compute the best university according to ratio between local and international students. We assume that a ratio of 1 is the best, that is the same number of local and international students. Again we only plot the first 30 top ones.
In [5]:
def uni_ratio_int_stu(df):
#Compute ratio
df['Ratio Int. students'] = df['# Int. students']/df['# Students']
#Sort the rows according to ratio and locate the top 30 ones
df_sorted_ratios = df.sort_values(by=['Ratio Int. students'], ascending=False).head(30)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
plot = sns.barplot(y=df_sorted_ratios['Ratio Int. students'], x=df_sorted_ratios.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90)
print("Best university according to ratio between local and international students is : " + df_sorted_ratios.iloc[0].name)
#Delete the ratio column of the dataframe
del df_sorted_ratios
In [6]:
uni_ratio_int_stu(df1)
Same observations as for the previous bar charts. It is good to note that the universities and their order are not the same as the previous ones.
Let's find the best country according to the ratio between faculty members and students. Previous assumptions about what is the best ratio still hold.
In [7]:
def country_ratio_fac_stu(df):
#Group by country and compute the ratios
df['Ratio'] = df['# Faculty members']/df['# Students']
#Sort the rows according to ratios
sorted_df = df.sort_values(by=['Ratio'], ascending=False)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
print("Best country according to ratio between faculty members and students is : " + sorted_df.iloc[0].name)
#Delete datas we don't need anymore
del df['Ratio']
del sorted_df
In [8]:
df1_country = df1.groupby(['Country']).agg(np.mean)
In [9]:
country_ratio_fac_stu(df1_country)
Russia is the exception with a high number in comparison of the others. Again the rest decreases in a linear fashion.
Let's find the best country according to the ratio between local and international students. Previous assumptions about what is the best ratio still hold.
In [10]:
def country_ratio_int_stu(df):
#Compute the ratios
df['Ratio'] = df['# Int. students']/df['# Students']
#Sort the rows according to ratios
sorted_df = df.sort_values(by=['Ratio'], ascending=False)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
print("Best country according to ratio between local and international students is : " + sorted_df.iloc[0].name)
#Delete datas we don't need anymore
del df
del sorted_df
In [11]:
country_ratio_int_stu(df1_country)
This bar chart is a bit different than the previous ones. Indeed the first ratios decrease way faster than the last ones but we could still use a combintion of two linear functions to approximate the decrease.
Let's find the best region according to the ratio between faculty members and students. Previous assumptions about what is the best ratio still hold.
In [12]:
def region_ratio_fac_stu(df):
#Group by region and compute the ratios
df['Ratio'] = df['# Faculty members']/df['# Students']
#Sort the rows according to ratios
sorted_df = df.sort_values(by=['Ratio'], ascending=False)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
df = df.groupby(['Region']).agg(sum)
plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
print("Best region according to ratio between faculty members and students is : " + sorted_df.iloc[0].name)
#Delete datas we don't need anymore
del df['Ratio']
In [13]:
df1_region = df1.groupby(['Region']).agg(np.mean)
In [14]:
region_ratio_fac_stu(df1_region)
Let's find the best region according to the ratio between local and international students. Previous assumptions about what is the best ratio still hold.
In [15]:
def region_ratio_int_stu(df):
#Compute the ratios
df['Ratio'] = df['# Int. students']/df['# Students']
#Sort the rows according to ratios
sorted_df = df.sort_values(by=['Ratio'], ascending=False)
#Plot the bar chart and print the best one
fig, ax = plt.subplots(figsize=(20,10))
plot = sns.barplot(y=sorted_df.Ratio, x=sorted_df.index, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=50)
print("Best region according to ratio between local and international students is : " + sorted_df.iloc[0].name)
#Delete datas we don't need anymore
del df
del sorted_df
In [16]:
region_ratio_int_stu(df1_region)
Oceania is clearly the leader here.
Helper to get the region way more easily
In [17]:
countries = dict(df1[['Country', 'Region']].drop_duplicates().values.tolist())
countries['Luxembourg'] = 'Europe'
URL containing the requested data in json
In [18]:
timeshighereducation_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"
ranking_brute = requests.get(timeshighereducation_url).json()['data']
In [19]:
infos_to_keep = ["rank", "name", "location", "stats_number_students", "stats_pc_intl_students", "stats_student_staff_ratio"]
column_names = ["Rank", "University", "Country", "# Students", "% Int. students", "% Faculty members"]
In [20]:
# creating dataframe
df2 = pd.DataFrame(ranking_brute[:200], index=range(1, 201), columns=infos_to_keep)
# more expressive column names
df2.columns = column_names
df2["Rank"] = df2["Rank"].apply(lambda x: int(re.sub("[^0-9]", "", x)))
# string to numerical values
df2["# Students"] = pd.to_numeric(df2["# Students"].map(lambda x: x.replace(",", "")))
df2["% Int. students"] = pd.to_numeric(df2["% Int. students"].map(lambda x: x.replace("%", ".")))
df2["% Faculty members"] = pd.to_numeric(df2["% Faculty members"])
# adding number of international students
df2["# Int. students"] = (df2["# Students"] * df2["% Int. students"] / 100).map(round)
# adding number of faculty members
df2["# Faculty members"] = (df2["# Students"] / df2["% Faculty members"]).map(round)
# Changing Russian federation to Russia
df2['Country'].replace('Russian Federation', 'Russia', inplace=True)
# adding regions
df2["Region"] = df2["Country"].map(lambda c: countries[c])
# Usign meaningful index
df2.index = df2['University']
del df2['University']
# Deleting % Int. students because it is not needed anymore
del df2['% Int. students']
del df2["% Faculty members"]
Best university according to ratio between faculty members and students. We assume that a bigger ratio is better. Even though it might not be desirable, we assume it is the case for this exercise. Again we plots only the first 30 ones.
In [21]:
uni_ratio_fac_stu(df2)
One university is clearly above the others, then it decreases in a linear fashion.
Best university according to ratio between local and international students. We assume that a ratio of 1 is the best, that is the same number of local and international students.
In [22]:
uni_ratio_int_stu(df2)
We can see that LSE school has 70% of international students which is really impressive.
Let's find the best country according to 1) ratio between faculty members and students and 2) ratio between local and international students. Previous assumptions about what is the best ratio still hold.
In [23]:
df2_country = df2.groupby(['Country']).agg(np.mean)
In [24]:
country_ratio_fac_stu(df2_country)
In Denmark there are around 6 students for a staff member. It's clearly above the average.
In [25]:
country_ratio_int_stu(df2_country)
English talking countries are more international (in term of international students ratio) in general and also small european countries like Luxembourg (really impressive how high), Austria and Switzerland.
Let's find the best region according to 1) ratio between faculty members and students and 2) ratio between local and international students. Previous assumptions about what is the best ratio still hold.
In [26]:
df2_region = df2.groupby(['Region']).agg(np.mean)
In [27]:
region_ratio_fac_stu(df2_region)
It's quite suprising to see Africa first. We wouldn't have expect that at first. Although Europe is quite at the back.
In [28]:
region_ratio_int_stu(df2_region)
To combine the two rankings, we will be using the name of the universities. However, the names are rarely exactly the same in both rankings, which means that we need to use regex to keep the important part of the name only.
Sometimes regex will not be enough, and we will need to change the name in one of the dataframe manually.
We start by working on the cases where regex is not enough.
In [29]:
df2.rename({'ETH Zurich – Swiss Federal Institute of Technology Zurich' : 'ETH Zurich – Swiss Federal Institute of Technology',
'Wageningen University & Research' : 'Wageningen University'}, inplace=True)
df1.rename({'UCL (University College London)': 'University College London',
'KAIST - Korea Advanced Institute of Science & Technology' : 'Korea Advanced Institute of Science and Technology (KAIST)',
'Ludwig-Maximilians-Universität München' : 'LMU Munich',
'Ruprecht-Karls-Universität Heidelberg' : 'Heidelberg University',
'University of North Carolina, Chapel Hill' : 'University of North Carolina at Chapel Hill',
'Trinity College Dublin, The University of Dublin' : 'Trinity College Dublin',
'KIT, Karlsruhe Institute of Technology' : 'Karlsruhe Institute of Technology',
'Humboldt-Universität zu Berlin' : 'Humboldt University of Berlin',
'Freie Universitaet Berlin': 'Free University of Berlin',
'Université de Montréal' : 'University of Montreal',
'Université Pierre et Marie Curie (UPMC)' : 'Pierre and Marie Curie University',
'Technische Universität Berlin (TU Berlin)' : 'Technical University of Berlin',
'Universitat Autònoma de Barcelona' : 'Autonomous University of Barcelona',
'Eberhard Karls Universität Tübingen' : 'University of Tübingen',
'Albert-Ludwigs-Universitaet Freiburg' : 'University of Freiburg',
'Scuola Superiore Sant\'Anna Pisa di Studi Universitari e di Perfezionamento': 'Scuola Superiore Sant’Anna'}, inplace=True)
And then we use regex on every university name.
In [30]:
regex1 = r'\([^()]*\)'
regex2 = r"(the)|(university)|(of)|(de)|(-)|( )|(’)|(')|\."
p = re.compile('('+regex1+'|'+regex2+')')
for df in [df1, df2]:
df['University_regex'] = df.index.values
# We first take care of the case
df['University_regex'] = df['University_regex'].apply(lambda x: x.casefold())
# Then we remove useless words, spaces, parentheses, their content and so on
df['University_regex'] = df['University_regex'].apply(lambda x: str(re.sub(p, '', x)))
# We then normalize the text to remove accents
df['University_regex'] = df['University_regex'].apply(lambda x: unicodedata.normalize("NFKD", x))
# And swith the encoding to utf-8
df['University_regex'] = df['University_regex'].apply(lambda x: x.rstrip().encode('ascii', errors='ignore')
.decode('utf-8'))
Once we know that the data is ready to be merged, we apply an inner merge. We chose to use an inner merge instead of an outer merge because it makes more sense to only keep universities present in both rankings.
In [31]:
df_merged = df2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')
Once the merge is done, we select and clean the data we are interested in. Moreover, since the number of student is different depending on the ranking, we decided to keep the average.
In [32]:
#Rename columns and put them in a new dataframe
df_all = df_merged.copy()
df_all['# Students (topuni)'] = df_merged['# Students_x']
df_all['# Students (times)'] = df_merged['# Students_y']
df_all['# Int. students (topuni)'] = df_merged['# Int. students_x']
df_all['# Int. students (times)'] = df_merged['# Int. students_y']
df_all['# Faculty members (topuni)'] = df_merged['# Faculty members_x']
df_all['# Faculty members (times)'] = df_merged['# Faculty members_y']
df_all['Rank topuni'] = df_merged['Rank_x']
df_all['Rank times'] = df_merged['Rank_y']
df_all['Country'] = df_merged['Country_x']
df_all['Region'] = df_merged['Region_x']
df_all['Ratio Int. students (topuni)'] = df_merged['Ratio Int. students_x']
df_all['Ratio Int. students (times)'] = df_merged['Ratio Int. students_y']
df_all['Ratio Faculty members (topuni)'] = df_merged['Ratio Faculty members_x']
df_all['Ratio Faculty members (times)'] = df_merged['Ratio Faculty members_y']
#Columns to keep
df_all = df_all[['# Students (topuni)', '# Students (times)',
'# Int. students (topuni)', '# Int. students (times)', '# Faculty members (topuni)',
'# Faculty members (times)', 'Rank topuni', 'Rank times', 'Country', 'Region',
'Ratio Int. students (topuni)', 'Ratio Int. students (times)', 'Ratio Faculty members (topuni)', 'Ratio Faculty members (times)']]
del df_merged
Plot of the correlation
In [33]:
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(df_all.corr(), ax=ax, square=True)
Out[33]:
Above, a few things are interesting to notice:
Note that by 'absolute' correlation we mean the absolute value of the correlation
To find the best university according to both ranking, we could use the following method: We take the mean of both ranking and the university with the smallest mean is the best university. With this method we try to find the university closest to the top in both rankings.
In [34]:
df_all['Rank mean'] = (df_all['Rank times'] + df_all['Rank topuni'])/2.0
best = df_all.sort_values(by=['Rank mean'], ascending=True).iloc[0]
print("The best university according to both ranking is " + best.name + " with a mean rank score of {}".format(best["Rank mean"]))
del df_all['Rank mean']
In [35]:
pca = PCA(n_components=2)
# Creates pairs of ranks for each university
ranks = np.array([df_all[['Rank times']].values.flatten(), df_all[['Rank topuni']].values.flatten()]).T
# Tells the PCA what the data is and then get the data in its new basis
ranks_new_basis = pca.fit(ranks).transform(ranks)
# Get the ratio of importance for each component
pca_score = pca.explained_variance_ratio_
# Get the components
V = pca.components_
# Multiplies the components by their ratio of importance compared to the least important component
ranks_pca_axis_x, ranks_pca_axis_y = V.T * pca_score / pca_score.min()
We then plotted the different Universities according to their respective ranking and showed the first two main component of the data (green and red).
In [36]:
starting_point = (100, 100)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.title('Universities ranking')
plt.plot(df_all[['Rank times']].values.flatten(), df_all[['Rank topuni']].values.flatten(), '.')
plt.plot([starting_point[0], starting_point[0] + ranks_pca_axis_x[0]*20], [starting_point[1], starting_point[1] + ranks_pca_axis_y[0]*20], label='Main component')
plt.plot([starting_point[0], starting_point[0] + ranks_pca_axis_x[1]*20], [starting_point[1], starting_point[1] + ranks_pca_axis_y[1]*20], label='Second component')
plt.xlabel('Ranking according to Times')
plt.ylabel('Ranking according to Topuni')
plt.legend()
plt.show()
Once we have the principle component, we can use it to transform our data and change its basis from (rank times, rank topuni) to (main component, second component)
In [37]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.scatter(ranks_new_basis.T[0], ranks_new_basis.T[1])
# Creates main component arrow
plt.plot([-125, 150], [0, 0], c='g', label='Main component')
plt.plot([140, 150], [4, 0], c='g')
plt.plot([140, 150], [-4, 0], c='g')
plt.xlabel('Main component')
plt.ylabel('Second component')
plt.title('Universities ranking in new basis')
plt.legend()
plt.show()
To get a final ranking system, we simply project the points on the main component axis and rate them depending on how far left they are. The more on the left, the better the rank.
In [38]:
# Keep only main component value of data
ranks_pca1 = ranks_new_basis.T[0]
# Get indices of sorted data
temp = ranks_pca1.argsort()
# Creates new array and then aranges values depending on indices of sorted data
new_ranks = np.empty(len(ranks_pca1), int)
new_ranks[temp] = np.arange(len(ranks_pca1))
We now add this new rank in the DataFrame and show the 20 first universities in our new ranking system.
In [39]:
df_all['Rank total'] = new_ranks + 1
Best university according to both ranking :
In [40]:
print("Best university according to PCA is : " + df_all.sort_values(by='Rank total').iloc[0].name)
As we can see with both the basic and alternative method Stanford University is the best according to both ranking. Which is not surpising considering its excellent reputation.