In [1]:
import pandas as pd
import numpy as np
import requests as requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247')
soupe = BeautifulSoup(r.text, 'html.parser')

Collect the data

Get the name of the 4 fields we have to select


In [3]:
select = soupe.find_all('select')
select_name = [s.attrs['name'] for s in select]
select_name


Out[3]:
['ww_x_UNITE_ACAD',
 'ww_x_PERIODE_ACAD',
 'ww_x_PERIODE_PEDAGO',
 'ww_x_HIVERETE']

Get the select field correspondind to the 4 names found before


In [4]:
select_field = [soupe.find('select',{'name': name}) for name in select_name]

Get the value corresponding to the "Informatique"


In [5]:
option_unite_acad = select_field[0].find_all('option')
#option_unite_acad[[opt.text == 'Informatique' for opt in option_unite_acad]]
option_unite_acad
unite_acad ={opt['value']: opt.text for opt in option_unite_acad if opt.text == 'Informatique'}
 
unite_acad


Out[5]:
{'249847': 'Informatique'}

Get all the values of the academic period field

In the second select_Field, in the option tag, we take all value execept the one equal to null

We only keep the period that are bigger than 2007 (in case there were older periods)


In [6]:
option = select_field[1].find_all('option')
period_acad = {opt['value']: opt.text for opt in option if opt['value'] != 'null' and int(opt.text.split('-')[0]) >= 2007}
period_acad


Out[6]:
{'123455150': '2011-2012',
 '123456101': '2012-2013',
 '213637754': '2013-2014',
 '213637922': '2014-2015',
 '213638028': '2015-2016',
 '355925344': '2016-2017',
 '39486325': '2010-2011',
 '978181': '2007-2008',
 '978187': '2008-2009',
 '978195': '2009-2010'}

Get all the values of the pedagogic period field correspoding to the bachelor semester

in the 3rd select_field, we take all value that contains 'Bachelor' in the label

Since we need to find the first and last record of a student, we only consider the 1st, 5th and 6th semester.

It is not possible to finish his bachelor during the 2, 3 or 4 semester but it is possible to finish during the 5th semester if we miss some credits during our last year and we only need one semester to finish


In [7]:
option = select_field[2].find_all('option')

period_pedago = {opt['value']: opt.text for opt in option if 'Bachelor' in opt.text and ('1' in opt.text or '5' in opt.text or '6' in opt.text) }
period_pedago


Out[7]:
{'2226768': 'Bachelor semestre 5b',
 '2226785': 'Bachelor semestre 6b',
 '249108': 'Bachelor semestre 1',
 '942120': 'Bachelor semestre 5',
 '942175': 'Bachelor semestre 6'}

In [8]:
option = select_field[3].find_all('option')
hiverEte = {opt['value']: opt.text for opt in option if opt['value'] != 'null'}
hiverEte


Out[8]:
{'2936286': "Semestre d'automne", '2936295': 'Semestre de printemps'}

Collect the data

Create a function that will parse one request and return a dataFrame


In [9]:
def parseRequest(u_a, p_a, p_p, h_e):

    #Send request
    url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD='+u_a[0]+'&ww_x_PERIODE_ACAD='+p_a[0]+'&ww_x_PERIODE_PEDAGO='+p_p[0]+'&ww_x_HIVERETE='+ h_e
    r = requests.get(url)
    soupe = BeautifulSoup(r.text, 'html.parser')
    
    #get the header , we also replace the space by '_' (easier to use the dataframe later)  
    th_tag = soupe.find_all('th')
    th = [t.text.replace(' ', '_') for t in th_tag]
    #remove the first th that correspond to the title of the table
    th = th[1:]

    
    #save the size of the header
    header_size = len(th)
    #add new information (new columns): year_start, year_stop, semester number
    th = np.append(th, ['Year_start', 'Year_stop', 'Semester'])
    
    #Find all the 'tr' tag
    tr_tag = soupe.find_all('tr')
    #drop the 2 first tag that correspond to the title and the headers of the table
    tr_tag = tr_tag[2:]
    
    #Temporary dictionary that will collect all the entry of the dataframe
    data = []
    
    #Read the request line by line and fill the dataframe
    for tr in tr_tag:
        #create the new entry
        row = [r.text.replace('\xa0', ' ')  for r in tr]
        #one row contains 12 elements but the header has only 11-> drop the last one because it is always empty
        row = row[:header_size]

        ##add the  new information to the row
        #split the academic period
        year = p_a[1].split('-')
        #find the semester
        semester = p_p[1].split()[2]
        newCol = [int(year[0]), int(year[1]), semester]

        #concat the row with the new info
        row += newCol
        data.append(row)

    df = pd.DataFrame(data, columns= th)
    return df

We iterate over all the parameters. We decided to skip the 'Type de semestre' (HIVERETE) since it is a redundant information. An odd semester is always in Autumn and an even one is always in Spring


In [10]:
list_df = []

for u_a in unite_acad.items():
    for p_a in period_acad.items():
        for p_p in period_pedago.items():
                print('Request for: ',u_a[1], p_a[1],  p_p[1])
                list_df.append(parseRequest(u_a,p_a, p_p, 'null'))


Student = pd.concat(list_df, ignore_index=True)
Student


Request for:  Informatique 2014-2015 Bachelor semestre 5b
Request for:  Informatique 2014-2015 Bachelor semestre 6
Request for:  Informatique 2014-2015 Bachelor semestre 6b
Request for:  Informatique 2014-2015 Bachelor semestre 1
Request for:  Informatique 2014-2015 Bachelor semestre 5
Request for:  Informatique 2008-2009 Bachelor semestre 5b
Request for:  Informatique 2008-2009 Bachelor semestre 6
Request for:  Informatique 2008-2009 Bachelor semestre 6b
Request for:  Informatique 2008-2009 Bachelor semestre 1
Request for:  Informatique 2008-2009 Bachelor semestre 5
Request for:  Informatique 2015-2016 Bachelor semestre 5b
Request for:  Informatique 2015-2016 Bachelor semestre 6
Request for:  Informatique 2015-2016 Bachelor semestre 6b
Request for:  Informatique 2015-2016 Bachelor semestre 1
Request for:  Informatique 2015-2016 Bachelor semestre 5
Request for:  Informatique 2010-2011 Bachelor semestre 5b
Request for:  Informatique 2010-2011 Bachelor semestre 6
Request for:  Informatique 2010-2011 Bachelor semestre 6b
Request for:  Informatique 2010-2011 Bachelor semestre 1
Request for:  Informatique 2010-2011 Bachelor semestre 5
Request for:  Informatique 2013-2014 Bachelor semestre 5b
Request for:  Informatique 2013-2014 Bachelor semestre 6
Request for:  Informatique 2013-2014 Bachelor semestre 6b
Request for:  Informatique 2013-2014 Bachelor semestre 1
Request for:  Informatique 2013-2014 Bachelor semestre 5
Request for:  Informatique 2016-2017 Bachelor semestre 5b
Request for:  Informatique 2016-2017 Bachelor semestre 6
Request for:  Informatique 2016-2017 Bachelor semestre 6b
Request for:  Informatique 2016-2017 Bachelor semestre 1
Request for:  Informatique 2016-2017 Bachelor semestre 5
Request for:  Informatique 2007-2008 Bachelor semestre 5b
Request for:  Informatique 2007-2008 Bachelor semestre 6
Request for:  Informatique 2007-2008 Bachelor semestre 6b
Request for:  Informatique 2007-2008 Bachelor semestre 1
Request for:  Informatique 2007-2008 Bachelor semestre 5
Request for:  Informatique 2011-2012 Bachelor semestre 5b
Request for:  Informatique 2011-2012 Bachelor semestre 6
Request for:  Informatique 2011-2012 Bachelor semestre 6b
Request for:  Informatique 2011-2012 Bachelor semestre 1
Request for:  Informatique 2011-2012 Bachelor semestre 5
Request for:  Informatique 2009-2010 Bachelor semestre 5b
Request for:  Informatique 2009-2010 Bachelor semestre 6
Request for:  Informatique 2009-2010 Bachelor semestre 6b
Request for:  Informatique 2009-2010 Bachelor semestre 1
Request for:  Informatique 2009-2010 Bachelor semestre 5
Request for:  Informatique 2012-2013 Bachelor semestre 5b
Request for:  Informatique 2012-2013 Bachelor semestre 6
Request for:  Informatique 2012-2013 Bachelor semestre 6b
Request for:  Informatique 2012-2013 Bachelor semestre 1
Request for:  Informatique 2012-2013 Bachelor semestre 5
Out[10]:
Civilité Ecole_Echange Filière_opt. Mineur No_Sciper Nom_Prénom Orientation_Bachelor Orientation_Master Semester Spécialisation Statut Type_Echange Year_start Year_stop
0 Madame University of Bristol 225654 Aeby Prisca 6 Congé Erasmus 2014.0 2015.0
1 Monsieur 202293 Aiulfi Loris Sandro 6 Présent 2014.0 2015.0
2 Monsieur 215576 Alonso Seisdedos Florian 6 Présent 2014.0 2015.0
3 Monsieur 213618 Amorim Afonso Caldeira Da Silva Pedro Maria 6 Présent 2014.0 2015.0
4 Monsieur 215623 Andreina Sébastien Laurent 6 Présent 2014.0 2015.0
5 Monsieur 212464 Angerand Grégoire Georges Jacques 6 Présent 2014.0 2015.0
6 Monsieur Carnegie Mellon University Pittsburgh 223410 Balle Daniel 6 Congé Bilatéral 2014.0 2015.0
7 Monsieur 215625 Barthe Sidney 6 Présent 2014.0 2015.0
8 Monsieur 212591 Beaud Guillaume François Paul 6 Congé 2014.0 2015.0
9 Monsieur 226638 Beguet Romain Michel 6 Présent 2014.0 2015.0
10 Monsieur Royal Institute of Technology, (KTH) Stockholm 226703 Bertrand Christophe 6 Congé Erasmus 2014.0 2015.0
11 Monsieur Chalmers University of Technology, Göteborg 202704 Billardon Baptiste Olivier Jacques 6 Congé Erasmus 2014.0 2015.0
12 Monsieur 224627 Bloch Aurélien François Gilbert 6 Présent 2014.0 2015.0
13 Monsieur 223606 Bosson Gaylor 6 Présent 2014.0 2015.0
14 Monsieur 212480 Bouvier Ogier Quentin 6 Présent 2014.0 2015.0
15 Monsieur 227209 Brousse Cyriaque Gilles Guillaume 6 Présent 2014.0 2015.0
16 Monsieur 202973 Cartier Alexis Victor Xavier 6 Présent 2014.0 2015.0
17 Monsieur 217548 Cattin Johan 6 Présent 2014.0 2015.0
18 Monsieur Eidgenössische Technische Hochschule Zürich 225203 Chatelain Bastien Ludovic 6 Congé Mobilité en Suisse 2014.0 2015.0
19 Monsieur 225483 Cook Cedric Radboud Ernest 6 Présent 2014.0 2015.0
20 Monsieur 223566 Cotofrei Radu-Mihai 6 Présent 2014.0 2015.0
21 Monsieur Royal Institute of Technology, (KTH) Stockholm 229467 De Vecchi Guillaume Paul Louis 6 Congé Erasmus 2014.0 2015.0
22 Monsieur 224704 Débieux Vincent 6 Présent 2014.0 2015.0
23 Monsieur Ecole Polytechnique de Montréal 225258 Desplaces Pierre-Antoine 6 Congé Bilatéral 2014.0 2015.0
24 Monsieur 213027 Devillard Florent 6 Présent 2014.0 2015.0
25 Monsieur 201253 Drougard Malcom Malo 6 Congé 2014.0 2015.0
26 Madame 194836 Duré Laurence 6 Congé 2014.0 2015.0
27 Monsieur 205727 Farine Aurélien 6 Présent 2014.0 2015.0
28 Monsieur 226647 Favre Léonard Michel 6 Présent 2014.0 2015.0
29 Monsieur Johns Hopkins University, Baltimore 216451 Fiszel Ruben Leo 6 Congé Bilatéral 2014.0 2015.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3244 Monsieur 196118 Ouwehand Frédéric Jan 5 Présent 2012.0 2013.0
3245 Monsieur 194977 Pagand Jordane 5 Présent 2012.0 2013.0
3246 Monsieur 196696 Pellet Marc 5 Présent 2012.0 2013.0
3247 Monsieur 207028 Pittet Valérian Thibault 5 Présent 2012.0 2013.0
3248 Monsieur 195290 Prakash Aman 5 Présent 2012.0 2013.0
3249 Monsieur 194171 Quarta Jonny 5 Présent 2012.0 2013.0
3250 Monsieur 205804 Reber Sven 5 Présent 2012.0 2013.0
3251 Monsieur 192687 Reis Rui 5 Présent 2012.0 2013.0
3252 Monsieur 205814 Rensch Colla Davóg Ross 5 Présent 2012.0 2013.0
3253 Monsieur 195950 Rieder Jeremy 5 Présent 2012.0 2013.0
3254 Monsieur 186870 Rolland Cédric 5 Présent 2012.0 2013.0
3255 Madame 184956 Rollier Orianne 5 Présent 2012.0 2013.0
3256 Monsieur 192674 Rossier Joël 5 Présent 2012.0 2013.0
3257 Monsieur 192304 Sandoz David 5 Présent 2012.0 2013.0
3258 Monsieur 195899 Scarnera Gianni 5 Présent 2012.0 2013.0
3259 Monsieur 195675 Schegg Elias 5 Présent 2012.0 2013.0
3260 Monsieur 202373 Schmutz Michaël Steven 5 Présent 2012.0 2013.0
3261 Monsieur 195446 Simond Florian 5 Présent 2012.0 2013.0
3262 Madame 184373 Sisto Maria 5 Présent 2012.0 2013.0
3263 Monsieur 193832 Szabo Kristof Tamas 5 Présent 2012.0 2013.0
3264 Monsieur 194603 Toumi Aymen 5 Présent 2012.0 2013.0
3265 Monsieur 202508 Vessaz Florian 5 Présent 2012.0 2013.0
3266 Monsieur 201229 Villa David 5 Présent 2012.0 2013.0
3267 Monsieur 175410 Vion Roger 5 Présent 2012.0 2013.0
3268 Monsieur 193558 Vokatch-Boldyrev Igor 5 Présent 2012.0 2013.0
3269 Monsieur 198216 Von Aarburg Raphaël Richard 5 Présent 2012.0 2013.0
3270 Monsieur 196683 Vostriakov Alexander 5 Présent 2012.0 2013.0
3271 Monsieur 184863 Wenger Amos 5 Présent 2012.0 2013.0
3272 Monsieur 184533 Zellweger Vincent 5 Présent 2012.0 2013.0
3273 Monsieur University of Waterloo 203457 Zommerfelds Christian Georges 5 Congé Bilatéral 2012.0 2013.0

3274 rows × 14 columns

How many years it took each student to go from the first to the sixth semester

As said before, here we check student that are in semester 1 (beginning) and semester 6 or 5 (in case they did the bachelor in 3.5 or 4.5 year)


In [11]:
Student.index = Student.No_Sciper + Student.Semester.astype(str) + Student.Year_start.astype(str)
Student.index.is_unique


Out[11]:
True

Show total number of student that made at least one semester


In [12]:
len(Student.No_Sciper.unique())


Out[12]:
1466

Eliminate student who don't finished their studies

We group by sciper number (which we now is unique for each student). It return a sciper with a dataframe containing all the entries for one student

We keep people that appear in semester 1, 5 and 6. => those are the people that graduated in informatique

We drop all other people because:

  • if they don't appear in semester 6 it means they never finished the Bachelor
  • if they appear only in semester 5 and 6 it means that they began in another section (usually in communication system), but we can't know when they began epfl without loading the data for all sections

But just to have an idea, we keep the person who only take part to semester 5 and 6, just to see the proportion


In [13]:
def computeTotalYears(df):
    start = df.Year_start.min()
    end = df.Year_stop.max()
    end_semester = df[df.Year_stop == end].Semester
    if(end_semester == '6').any():
        return (int(end) - int(start)) 
    else: 
        return (int(end) - int(start) -0.5)

In [14]:
Student_copy = Student.copy()
Student_copy.index = Student.index


#We init the dataframe
#store people that complete the 3 years in informatique
Bachelor = pd.DataFrame(columns = ['Sciper', 'Civilité', 'Years'])
#store people that complet only the 2 last years
Only_5_6 = pd.DataFrame(columns = ['Sciper', 'Civilité', 'Years'])

#Groupe by sciper
grouped = Student_copy.groupby(['No_Sciper'])



for scip, group in grouped:
    if((group.Semester != '1').all() and (group.Semester == '5').any() and (group.Semester == '6').any()): 
        total = computeTotalYears(group)
        Only_5_6.ix[scip] = [scip,group.Civilité.iloc[0] , total  ]
    elif((group.Semester == '1').any() and (group.Semester == '5').any() and (group.Semester == '6').any()):
        total = computeTotalYears(group)
        Bachelor.ix[scip] = [scip,group.Civilité.iloc[0] , total ]

In [15]:
Bachelor.Years.max()


Out[15]:
7.0

In [16]:
Bachelor.Years.min()


Out[16]:
3.0

In [17]:
Bachelor.head()


Out[17]:
Sciper Civilité Years
147008 147008 Monsieur 3.0
169569 169569 Monsieur 3.0
169731 169731 Monsieur 4.0
169795 169795 Monsieur 4.5
171195 171195 Monsieur 3.0

Person that didn't complete the first year in compute Science, we don't consider them since we can't know when they begin their first year


In [18]:
Only_5_6.count()


Out[18]:
Sciper      119
Civilité    119
Years       119
dtype: int64

Nomber of person that complete the bachelor in computer science


In [19]:
Bachelor.count()


Out[19]:
Sciper      397
Civilité    397
Years       397
dtype: int64

Number of person that tried at least the first years or last one


In [20]:
len(grouped)


Out[20]:
1466

Person that tried the first year but never finished the bachelor


In [21]:
len(grouped) - len(Bachelor) - len(Only_5_6)


Out[21]:
950

Compute the average time (in years) to complete the bachelor

we choose to ouptut the result in years since it is more significant for human than month. To have the number of months we just need to multiply by 12

In total


In [22]:
len(Bachelor)


Out[22]:
397

In [23]:
average = Bachelor.Years.sum()/len(Bachelor)
average


Out[23]:
3.5604534005037785

In [24]:
Bachelor.Years.max()


Out[24]:
7.0

In [25]:
Bachelor.Years.min()


Out[25]:
3.0

In [26]:
Bachelor.Years.hist(bins = 10, range=[3, 8])


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1174477f0>

Female


In [27]:
Female = Bachelor[Bachelor.Civilité == 'Madame']
len(Female)


Out[27]:
29

In [28]:
averageFemale = Female.Years.sum()/len(Female)
averageFemale


Out[28]:
3.396551724137931

In [29]:
Female.Years.hist(bins = 10, range=[3, 8])


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x116fef278>

Male


In [30]:
Male = Bachelor[Bachelor.Civilité == 'Monsieur']
len(Male)


Out[30]:
368

In [31]:
average = Male.Years.sum()/len(Male)
average


Out[31]:
3.573369565217391

In [32]:
Male.Years.hist(bins = 10, range=[3, 8])


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x116f0a4a8>

Test the results


In [33]:
import scipy.stats as stats

We want to see if the difference of the average years for female and male are statistically significant with a threshold of 95%

We use a Welch's T-Test (which does not assume equal population variance): it measures whether the average value differs significantly across samples.


In [34]:
stats.ttest_ind(a = Female.Years, b= Male.Years, equal_var=False)


Out[34]:
Ttest_indResult(statistic=-1.3437005678090845, pvalue=0.18785555340784144)

Since the pvalue is > 0.05, we cannot reject the null hypothesis of identical average scores which means: we cannot say that the difference is in average statistically significant


In [ ]: