In [1]:
import pandas as pd
import numpy as np
import requests as requests
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.mode.chained_assignment = None # default='warn'
In [2]:
# Download the report filter page; the <select> fields of this form are inspected below.
# The timeout keeps a "Run All" from hanging forever if the server stops answering.
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the form.
r.raise_for_status()
soupe = BeautifulSoup(r.text, 'html.parser')
In [3]:
# Collect every <select> element of the page and list their `name` attributes.
select = soupe.find_all('select')
select_name = [tag['name'] for tag in select]
select_name
Out[3]:
In [4]:
# Look each <select> up again by its name, in the same order as select_name.
select_field = []
for name in select_name:
    select_field.append(soupe.find('select', attrs={'name': name}))
In [5]:
# Options of the first <select>; keep only the entry labelled 'Informatique',
# stored as {option value: option label}.
option_unite_acad = select_field[0].find_all('option')
unite_acad = {}
for opt in option_unite_acad:
    if opt.text == 'Informatique':
        unite_acad[opt['value']] = opt.text
unite_acad
Out[5]:
In [6]:
# Options of the second <select>: skip the 'null' placeholder and keep the entries
# whose label starts (before the first '-') with a year of 2007 or later.
option = select_field[1].find_all('option')
period_acad = {}
for opt in option:
    if opt['value'] == 'null':
        continue
    first_year = int(opt.text.split('-')[0])
    if first_year >= 2007:
        period_acad[opt['value']] = opt.text
period_acad
Out[6]:
In the third select field, we keep every option whose label contains 'Bachelor'.
Since we need to find the first and last record of each student, we only consider the 1st, 5th and 6th semesters.
It is not possible to finish a bachelor during semester 2, 3 or 4, but it is possible to finish during the 5th semester: a student who is missing some credits after the last year may need only one more semester to finish.
In [7]:
# Options of the third <select>: keep the labels that mention 'Bachelor' and
# contain at least one of the digits 1, 5 or 6.
option = select_field[2].find_all('option')
period_pedago = {
    opt['value']: opt.text
    for opt in option
    if 'Bachelor' in opt.text and any(digit in opt.text for digit in ('1', '5', '6'))
}
period_pedago
Out[7]:
In [8]:
# Options of the fourth <select>, minus the 'null' placeholder, as {value: label}.
option = select_field[3].find_all('option')
hiverEte = dict(
    (opt['value'], opt.text)
    for opt in option
    if opt['value'] != 'null'
)
hiverEte
Out[8]:
Create a function that parses one request and returns a DataFrame
In [9]:
def parseRequest(u_a, p_a, p_p, h_e):
    """Fetch one report page and return its table as a DataFrame.

    Parameters
    ----------
    u_a, p_a, p_p : sequence of (value, label)
        Selected academic unit, academic period and pedagogic period.
        Element 0 of each is sent as the request parameter. The label of
        ``p_a`` is split on '-' into two integer years, and the third
        whitespace-separated word of the label of ``p_p`` is used as the
        semester.
    h_e : str
        Value sent as the ``ww_x_HIVERETE`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the page (the first two are skipped). The
        columns are the ``<th>`` texts (the first one is skipped, spaces are
        replaced by '_') followed by ``Year_start``, ``Year_stop`` (int) and
        ``Semester`` (str).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Let requests build and escape the query string instead of gluing it by hand.
    params = {
        'ww_x_GPS': '-1',
        'ww_i_reportModel': '133685247',
        'ww_i_reportModelXsl': '133685270',
        'ww_x_UNITE_ACAD': u_a[0],
        'ww_x_PERIODE_ACAD': p_a[0],
        'ww_x_PERIODE_PEDAGO': p_p[0],
        'ww_x_HIVERETE': h_e,
    }
    response = requests.get(
        'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html',
        params=params,
        timeout=30,  # do not hang the whole scraping loop on one stalled request
    )
    # Stop here on an HTTP error rather than parsing an error page as a table.
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    # Header: every <th> except the first one (the table title); spaces become '_'
    # so the columns can be used as attributes later.
    header = [th.text.replace(' ', '_') for th in page.find_all('th')][1:]
    header_size = len(header)
    columns = header + ['Year_start', 'Year_stop', 'Semester']

    # The extra columns are identical for every row of this request: compute them once.
    year = p_a[1].split('-')
    semester = p_p[1].split()[2]
    extra = [int(year[0]), int(year[1]), semester]

    # Data rows: skip the first two <tr> (title and header lines of the table).
    data = []
    for tr in page.find_all('tr')[2:]:
        cells = [cell.text.replace('\xa0', ' ') for cell in tr]
        # A row can hold more cells than the header has columns; keep the first header_size.
        data.append(cells[:header_size] + extra)

    return pd.DataFrame(data, columns=columns)
We iterate over all the parameters. We decided to skip the 'Type de semestre' (HIVERETE) parameter since it is redundant information: an odd semester is always in autumn and an even one is always in spring.
In [10]:
# One request per (unit, academic period, pedagogic period) combination;
# the HIVERETE parameter is always sent as 'null'.
list_df = []
for unit in unite_acad.items():
    unit_label = unit[1]
    for acad_period in period_acad.items():
        for pedago_period in period_pedago.items():
            print('Request for: ', unit_label, acad_period[1], pedago_period[1])
            frame = parseRequest(unit, acad_period, pedago_period, 'null')
            list_df.append(frame)
# Stack the per-request tables into one frame with a fresh 0..n-1 index.
Student = pd.concat(list_df, ignore_index=True)
Student
Out[10]:
In [11]:
# Composite key: sciper + semester + starting year, used as the row index.
composite_key = (
    Student['No_Sciper']
    + Student['Semester'].astype(str)
    + Student['Year_start'].astype(str)
)
Student.index = composite_key
# True only when no composite key is repeated.
Student.index.is_unique
Out[11]:
Show the total number of students who attended at least one semester
In [12]:
len(Student.No_Sciper.unique())
Out[12]:
We group by sciper number (which we know is unique for each student). Each group is a sciper together with a dataframe containing all the entries for that student.
We keep the people who appear in semesters 1, 5 and 6 => those are the people who graduated in informatique
We drop all other people because:
But, just to get an idea of the proportion, we also keep the people who only took part in semesters 5 and 6
In [13]:
def computeTotalYears(df):
    """Return the number of years spanned by one student's records.

    The span is ``max(Year_stop) - min(Year_start)``. When none of the rows
    of the last year (``Year_stop`` equal to its maximum) has ``Semester``
    equal to '6', half a year is subtracted.
    """
    last_year = df.Year_stop.max()
    first_year = df.Year_start.min()
    # Semesters recorded during the last academic year of this student.
    final_semesters = df.loc[df.Year_stop == last_year, 'Semester']
    span = int(last_year) - int(first_year)
    if (final_semesters == '6').any():
        return span
    return span - 0.5
In [14]:
# Work on a copy so the scraped `Student` frame is left untouched
# (DataFrame.copy() already carries the index over).
Student_copy = Student.copy()

result_columns = ['Sciper', 'Civilité', 'Years']
# Rows are collected in plain lists and turned into DataFrames once at the end:
# row-wise assignment through `.ix` no longer exists in pandas >= 1.0, and growing
# a frame one row at a time is slow.
bachelor_rows = []   # students seen in semesters 1, 5 and 6
only_5_6_rows = []   # students seen in semesters 5 and 6 but never in semester 1

# Group by sciper. A scalar key (not a one-element list) makes the loop yield the
# bare sciper rather than a 1-tuple on recent pandas versions.
grouped = Student_copy.groupby('No_Sciper')
for scip, group in grouped:
    semesters = group.Semester
    # Both result tables require the student to appear in semesters 5 and 6.
    if not ((semesters == '5').any() and (semesters == '6').any()):
        continue
    record = [scip, group.Civilité.iloc[0], computeTotalYears(group)]
    if (semesters == '1').any():
        bachelor_rows.append(record)
    else:
        only_5_6_rows.append(record)

# Store people that completed the 3 years in informatique, indexed by sciper.
Bachelor = pd.DataFrame(
    bachelor_rows,
    columns=result_columns,
    index=[row[0] for row in bachelor_rows],
)
# Store people that completed only the 2 last years, indexed by sciper.
Only_5_6 = pd.DataFrame(
    only_5_6_rows,
    columns=result_columns,
    index=[row[0] for row in only_5_6_rows],
)
In [15]:
Bachelor.Years.max()
Out[15]:
In [16]:
Bachelor.Years.min()
Out[16]:
In [17]:
Bachelor.head()
Out[17]:
People who didn't complete the first year in Computer Science: we don't consider them since we can't know when they began their first year
In [18]:
Only_5_6.count()
Out[18]:
Number of people who completed the bachelor in computer science
In [19]:
Bachelor.count()
Out[19]:
Number of people who attended at least the first year or the last one
In [20]:
len(grouped)
Out[20]:
People who tried the first year but never finished the bachelor
In [21]:
len(grouped) - len(Bachelor) - len(Only_5_6)
Out[21]:
In [22]:
len(Bachelor)
Out[22]:
In [23]:
average = Bachelor.Years.sum()/len(Bachelor)
average
Out[23]:
In [24]:
Bachelor.Years.max()
Out[24]:
In [25]:
Bachelor.Years.min()
Out[25]:
In [26]:
Bachelor.Years.hist(bins = 10, range=[3, 8])
Out[26]:
In [27]:
Female = Bachelor[Bachelor.Civilité == 'Madame']
len(Female)
Out[27]:
In [28]:
averageFemale = Female.Years.sum()/len(Female)
averageFemale
Out[28]:
In [29]:
Female.Years.hist(bins = 10, range=[3, 8])
Out[29]:
In [30]:
Male = Bachelor[Bachelor.Civilité == 'Monsieur']
len(Male)
Out[30]:
In [31]:
# Mean duration for male students. Stored under its own name (mirroring
# `averageFemale`) so the overall `average` computed earlier is not overwritten.
averageMale = Male.Years.sum()/len(Male)
averageMale
Out[31]:
In [32]:
Male.Years.hist(bins = 10, range=[3, 8])
Out[32]:
In [33]:
import scipy.stats as stats
We want to see whether the difference between the average number of years for female and male students is statistically significant at the 95% confidence level.
We use Welch's t-test (which does not assume equal population variances): it tests whether the mean differs significantly between the two samples.
In [34]:
stats.ttest_ind(a = Female.Years, b= Male.Years, equal_var=False)
Out[34]:
Since the p-value is > 0.05, we cannot reject the null hypothesis of identical average durations, which means we cannot say that the difference between the two averages is statistically significant
In [ ]: