In [1]:

    
# imports (may be more than needed)
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import itertools
%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'

Master Data

Obtain all the data for the Bachelor students, starting from 2007. Keep only the students for which you have an entry for both Bachelor semestre 1 and Bachelor semestre 6. Compute how many months it took each student to go from the first to the sixth semester. Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?

2) Perform a similar operation to what described above, this time for Master students. Notice that this data is more tricky, as there are many missing records in the IS-Academia database. Therefore, try to guess how much time a master student spent at EPFL by at least checking the distance in months between Master semestre 1 and Master semestre 2. If the Mineur field is not empty, the student should also appear registered in Master semestre 3. Last but not the least, don't forget to check if the student has an entry also in the Projet Master tables. Once you can handle well this data, compute the "average stay at EPFL" for master students. Now extract all the students with a Spécialisation and compute the "average stay" per each category of that attribute -- compared to the general average, can you find any specialization for which the difference in average is statistically significant?

Read the data from csv



In [2]:

    
all_data = pd.read_csv('all_data.csv', usecols=['Civilité', 'Nom_Prénom', 'title', 'periode_acad', 'periode_pedago','Orientation_Master', 'Spécialisation', 'Filière_opt.', 'Mineur', 'Statut', 'Type_Echange', 'Ecole_Echange', 'No_Sciper'])



In [3]:

    
all_data.sort_values(by='No_Sciper', axis=0).head(10)









    Out[3]:






  
    
      
      Civilité
      Nom_Prénom
      Orientation_Master
      Spécialisation
      Filière_opt.
      Mineur
      Statut
      Type_Echange
      Ecole_Echange
      No_Sciper
      title
      periode_acad
      periode_pedago
    
  
  
    
      7567
      Monsieur
      Martin Damien
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      121367
      Passerelle HES - IN, 2016-2017, Semestre automne
      2016-2017
      Semestre automne
    
    
      8828
      Monsieur
      Gulati Asheesh
      NaN
      Internet computing
      NaN
      NaN
      Présent
      NaN
      NaN
      128911
      Informatique, 2007-2008, Master semestre 3
      2007-2008
      Master semestre 3
    
    
      8602
      Monsieur
      Gulati Asheesh
      NaN
      Internet computing
      NaN
      NaN
      Présent
      NaN
      NaN
      128911
      Informatique, 2007-2008, Master semestre 2
      2007-2008
      Master semestre 2
    
    
      8870
      Monsieur
      Zhou Maoan
      NaN
      NaN
      NaN
      NaN
      Stage
      NaN
      NaN
      129093
      Informatique, 2007-2008, Master semestre 3
      2007-2008
      Master semestre 3
    
    
      8654
      Monsieur
      Zhou Maoan
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      129093
      Informatique, 2007-2008, Master semestre 2
      2007-2008
      Master semestre 2
    
    
      7563
      Monsieur
      Essellak Radouane
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      129094
      Passerelle HES - IN, 2016-2017, Semestre automne
      2016-2017
      Semestre automne
    
    
      8848
      Monsieur
      Ni Zhong Zhong
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      129326
      Informatique, 2007-2008, Master semestre 3
      2007-2008
      Master semestre 3
    
    
      8631
      Monsieur
      Ni Zhong Zhong
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      129326
      Informatique, 2007-2008, Master semestre 2
      2007-2008
      Master semestre 2
    
    
      8407
      Monsieur
      Droz-dit-Busset Arnault
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      138088
      Informatique, 2007-2008, Master semestre 1
      2007-2008
      Master semestre 1
    
    
      8813
      Monsieur
      Clivaz Jean-Philippe
      NaN
      NaN
      NaN
      NaN
      Présent
      NaN
      NaN
      145546
      Informatique, 2007-2008, Master semestre 3
      2007-2008
      Master semestre 3



In [4]:

    
len(all_data)









    Out[4]:





8942

Clean the data



In [5]:

    
all_data['periode_pedago'].unique()









    Out[5]:





array(['Bachelor semestre 5', 'Master semestre 1', 'Bachelor semestre 2',
       'Bachelor semestre 6', 'Master semestre 2',
       'Projet Master printemps', 'Bachelor semestre 3',
       'Bachelor semestre 1', 'Master semestre 3', 'Semestre printemps',
       'Projet Master automne', 'Semestre automne', 'Bachelor semestre 4'], dtype=object)



In [6]:

    
all_data['title'].unique()









    Out[6]:





array(['Informatique, 2009-2010, Bachelor semestre 5',
       'Informatique, 2009-2010, Master semestre 1',
       'Informatique, 2009-2010, Bachelor semestre 2',
       'Informatique, 2009-2010, Bachelor semestre 6',
       'Informatique, 2009-2010, Master semestre 2',
       'Informatique, 2009-2010, Projet Master printemps',
       'Informatique, 2009-2010, Bachelor semestre 3',
       'Informatique, 2009-2010, Bachelor semestre 1',
       'Informatique, 2009-2010, Master semestre 3',
       'Passerelle HES - IN, 2009-2010, Semestre printemps',
       'Informatique, 2009-2010, Projet Master automne',
       'Passerelle HES - IN, 2009-2010, Semestre automne',
       'Echange IN, 2009-2010, Semestre automne',
       'Informatique, 2009-2010, Bachelor semestre 4',
       'Informatique, 2014-2015, Bachelor semestre 5',
       'Informatique, 2014-2015, Master semestre 1',
       'Informatique, 2014-2015, Bachelor semestre 2',
       'Informatique, 2014-2015, Bachelor semestre 6',
       'Informatique, 2014-2015, Master semestre 2',
       'Informatique, 2014-2015, Bachelor semestre 3',
       'Informatique, 2014-2015, Bachelor semestre 1',
       'Informatique, 2014-2015, Master semestre 3',
       'Passerelle HES - IN, 2014-2015, Semestre printemps',
       'Echange IN, 2014-2015, Semestre printemps',
       'Informatique, 2014-2015, Projet Master automne',
       'Passerelle HES - IN, 2014-2015, Semestre automne',
       'Echange IN, 2014-2015, Semestre automne',
       'Informatique, 2014-2015, Bachelor semestre 4',
       'Informatique, 2012-2013, Bachelor semestre 5',
       'Informatique, 2012-2013, Master semestre 1',
       'Informatique, 2012-2013, Bachelor semestre 2',
       'Informatique, 2012-2013, Bachelor semestre 6',
       'Informatique, 2012-2013, Master semestre 2',
       'Informatique, 2012-2013, Projet Master printemps',
       'Informatique, 2012-2013, Bachelor semestre 3',
       'Informatique, 2012-2013, Bachelor semestre 1',
       'Informatique, 2012-2013, Master semestre 3',
       'Passerelle HES - IN, 2012-2013, Semestre printemps',
       'Informatique, 2012-2013, Projet Master automne',
       'Passerelle HES - IN, 2012-2013, Semestre automne',
       'Echange IN, 2012-2013, Semestre automne',
       'Informatique, 2012-2013, Bachelor semestre 4',
       'Informatique, 2008-2009, Bachelor semestre 5',
       'Informatique, 2008-2009, Master semestre 1',
       'Informatique, 2008-2009, Bachelor semestre 2',
       'Informatique, 2008-2009, Bachelor semestre 6',
       'Informatique, 2008-2009, Master semestre 2',
       'Informatique, 2008-2009, Projet Master printemps',
       'Passerelle HES - IN, 2008-2009, Bachelor semestre 3',
       'Informatique, 2008-2009, Bachelor semestre 3',
       'Informatique, 2008-2009, Bachelor semestre 1',
       'Informatique, 2008-2009, Master semestre 3',
       'Echange IN, 2008-2009, Semestre printemps',
       'Informatique, 2008-2009, Projet Master automne',
       'Echange IN, 2008-2009, Semestre automne',
       'Informatique, 2008-2009, Bachelor semestre 4',
       'Passerelle HES - IN, 2008-2009, Bachelor semestre 4',
       'Informatique, 2015-2016, Bachelor semestre 5',
       'Informatique, 2015-2016, Master semestre 1',
       'Informatique, 2015-2016, Bachelor semestre 2',
       'Informatique, 2015-2016, Bachelor semestre 6',
       'Informatique, 2015-2016, Master semestre 2',
       'Informatique, 2015-2016, Bachelor semestre 3',
       'Informatique, 2015-2016, Bachelor semestre 1',
       'Informatique, 2015-2016, Master semestre 3',
       'Passerelle HES - IN, 2015-2016, Semestre printemps',
       'Echange IN, 2015-2016, Semestre printemps',
       'Informatique, 2015-2016, Projet Master automne',
       'Echange IN, 2015-2016, Semestre automne',
       'Passerelle HES - IN, 2015-2016, Semestre automne',
       'Informatique, 2015-2016, Bachelor semestre 4',
       'Informatique, 2011-2012, Bachelor semestre 5',
       'Informatique, 2011-2012, Master semestre 1',
       'Informatique, 2011-2012, Bachelor semestre 2',
       'Informatique, 2011-2012, Bachelor semestre 6',
       'Informatique, 2011-2012, Master semestre 2',
       'Informatique, 2011-2012, Projet Master printemps',
       'Informatique, 2011-2012, Bachelor semestre 3',
       'Informatique, 2011-2012, Bachelor semestre 1',
       'Informatique, 2011-2012, Master semestre 3',
       'Passerelle HES - IN, 2011-2012, Semestre printemps',
       'Echange IN, 2011-2012, Semestre printemps',
       'Informatique, 2011-2012, Projet Master automne',
       'Passerelle HES - IN, 2011-2012, Semestre automne',
       'Echange IN, 2011-2012, Semestre automne',
       'Informatique, 2011-2012, Bachelor semestre 4',
       'Informatique, 2013-2014, Bachelor semestre 5',
       'Informatique, 2013-2014, Master semestre 1',
       'Informatique, 2013-2014, Bachelor semestre 2',
       'Informatique, 2013-2014, Bachelor semestre 6',
       'Informatique, 2013-2014, Master semestre 2',
       'Informatique, 2013-2014, Projet Master printemps',
       'Informatique, 2013-2014, Bachelor semestre 3',
       'Informatique, 2013-2014, Bachelor semestre 1',
       'Informatique, 2013-2014, Master semestre 3',
       'Passerelle HES - IN, 2013-2014, Semestre printemps',
       'Echange IN, 2013-2014, Semestre printemps',
       'Informatique, 2013-2014, Projet Master automne',
       'Echange IN, 2013-2014, Semestre automne',
       'Passerelle HES - IN, 2013-2014, Semestre automne',
       'Informatique, 2013-2014, Bachelor semestre 4',
       'Informatique, 2016-2017, Bachelor semestre 5',
       'Informatique, 2016-2017, Master semestre 1',
       'Informatique, 2016-2017, Bachelor semestre 2',
       'Informatique, 2016-2017, Bachelor semestre 6',
       'Informatique, 2016-2017, Master semestre 2',
       'Informatique, 2016-2017, Bachelor semestre 3',
       'Informatique, 2016-2017, Bachelor semestre 1',
       'Informatique, 2016-2017, Master semestre 3',
       'Echange IN, 2016-2017, Semestre printemps',
       'Informatique, 2016-2017, Projet Master automne',
       'Echange IN, 2016-2017, Semestre automne',
       'Passerelle HES - IN, 2016-2017, Semestre automne',
       'Informatique, 2016-2017, Bachelor semestre 4',
       'Informatique, 2010-2011, Bachelor semestre 5',
       'Informatique, 2010-2011, Master semestre 1',
       'Informatique, 2010-2011, Bachelor semestre 2',
       'Informatique, 2010-2011, Bachelor semestre 6',
       'Informatique, 2010-2011, Master semestre 2',
       'Informatique, 2010-2011, Projet Master printemps',
       'Informatique, 2010-2011, Bachelor semestre 3',
       'Informatique, 2010-2011, Bachelor semestre 1',
       'Informatique, 2010-2011, Master semestre 3',
       'Echange IN, 2010-2011, Semestre printemps',
       'Informatique, 2010-2011, Projet Master automne',
       'Echange IN, 2010-2011, Semestre automne',
       'Passerelle HES - IN, 2010-2011, Semestre automne',
       'Informatique, 2010-2011, Bachelor semestre 4',
       'Informatique, 2007-2008, Bachelor semestre 5',
       'Informatique, 2007-2008, Master semestre 1',
       'Informatique, 2007-2008, Bachelor semestre 2',
       'Informatique, 2007-2008, Bachelor semestre 6',
       'Informatique, 2007-2008, Master semestre 2',
       'Informatique, 2007-2008, Projet Master printemps',
       'Informatique, 2007-2008, Bachelor semestre 3',
       'Passerelle HES - IN, 2007-2008, Bachelor semestre 3',
       'Informatique, 2007-2008, Bachelor semestre 1',
       'Informatique, 2007-2008, Master semestre 3',
       'Echange IN, 2007-2008, Semestre printemps',
       'Informatique, 2007-2008, Projet Master automne',
       'Echange IN, 2007-2008, Semestre automne',
       'Passerelle HES - IN, 2007-2008, Bachelor semestre 4',
       'Informatique, 2007-2008, Bachelor semestre 4'], dtype=object)

checkout what the Semester printemps and automne are



In [7]:

    
all_data[all_data['periode_pedago'].isin(['Semestre printemps', 'Semestre automne'])]['title'].unique()









    Out[7]:





array(['Passerelle HES - IN, 2009-2010, Semestre printemps',
       'Passerelle HES - IN, 2009-2010, Semestre automne',
       'Echange IN, 2009-2010, Semestre automne',
       'Passerelle HES - IN, 2014-2015, Semestre printemps',
       'Echange IN, 2014-2015, Semestre printemps',
       'Passerelle HES - IN, 2014-2015, Semestre automne',
       'Echange IN, 2014-2015, Semestre automne',
       'Passerelle HES - IN, 2012-2013, Semestre printemps',
       'Passerelle HES - IN, 2012-2013, Semestre automne',
       'Echange IN, 2012-2013, Semestre automne',
       'Echange IN, 2008-2009, Semestre printemps',
       'Echange IN, 2008-2009, Semestre automne',
       'Passerelle HES - IN, 2015-2016, Semestre printemps',
       'Echange IN, 2015-2016, Semestre printemps',
       'Echange IN, 2015-2016, Semestre automne',
       'Passerelle HES - IN, 2015-2016, Semestre automne',
       'Passerelle HES - IN, 2011-2012, Semestre printemps',
       'Echange IN, 2011-2012, Semestre printemps',
       'Passerelle HES - IN, 2011-2012, Semestre automne',
       'Echange IN, 2011-2012, Semestre automne',
       'Passerelle HES - IN, 2013-2014, Semestre printemps',
       'Echange IN, 2013-2014, Semestre printemps',
       'Echange IN, 2013-2014, Semestre automne',
       'Passerelle HES - IN, 2013-2014, Semestre automne',
       'Echange IN, 2016-2017, Semestre printemps',
       'Echange IN, 2016-2017, Semestre automne',
       'Passerelle HES - IN, 2016-2017, Semestre automne',
       'Echange IN, 2010-2011, Semestre printemps',
       'Echange IN, 2010-2011, Semestre automne',
       'Passerelle HES - IN, 2010-2011, Semestre automne',
       'Echange IN, 2007-2008, Semestre printemps',
       'Echange IN, 2007-2008, Semestre automne'], dtype=object)

This are the students exchange (students from other universities in exchange at EPFL) and students doing the Passerelle HES. Since the exchange students don't do a full master at EPFL we ignore them. Also we won't count students in Passerelle HES as beeing in the Master just yet because they have to succeed the passerelle to optain a master. So if they do a master they are inscribed in Master anyways.



In [8]:

    
master_periode_pedago = ['Master semestre 1', 'Master semestre 2', 'Projet Master printemps', 'Master semestre 3', 'Projet Master automne']
master_data = all_data[all_data['periode_pedago'].isin(master_periode_pedago)]

remove the columns with only NaN (in case there is one)



In [9]:

    
master_data.dropna(axis=1, how='all', inplace=True)

Rename the columns (remove the é and give shorter names)



In [10]:

    
master_data.rename(columns={'Civilité': 'Sex', 'Nom_Prénom': 'Name', 'Spécialisation': 'Specialisation'}, inplace=True)



In [11]:

    
master_data['periode_pedago'].unique()









    Out[11]:





array(['Master semestre 1', 'Master semestre 2', 'Projet Master printemps',
       'Master semestre 3', 'Projet Master automne'], dtype=object)

add some columns that make the use of the data easier later

store the start and end year of each entry ('2008-2009' -> 2008) and '2008-2009' -> 2009
make a column for master1, master2, master3 and and project master and indicate if done it (true and false)



In [12]:

    
master_data['periode_acad'].unique()









    Out[12]:





array(['2009-2010', '2014-2015', '2012-2013', '2008-2009', '2015-2016',
       '2011-2012', '2013-2014', '2016-2017', '2010-2011', '2007-2008'], dtype=object)



In [13]:

    
# store the year of the entry
def start_year(student):
    return int (student['periode_acad'].split('-')[0])
master_data['start_year'] = master_data.apply(start_year, axis=1)
master_data['end_year'] = master_data.apply(lambda st: start_year(st)+1, axis=1)



In [14]:

    
# make the indicator columns
new_cols_map = {
    'Master1': ['Master semestre 1'],
    'Master2': ['Master semestre 2'],
    'Master3': ['Master semestre 3'],
    'Project_Master' : ['Projet Master printemps', 'Projet Master automne']
}
for (new_col_name, match_list) in new_cols_map.items():
    master_data[new_col_name] = master_data.apply(lambda student: student['periode_pedago'] in match_list, axis=1)



In [15]:

    
# show the new master_data (sample randomly to keep it managable but still informative)
master_data.sample(n = 10, axis=0, replace=False)









    Out[15]:






  
    
      
      Sex
      Name
      Specialisation
      Mineur
      Statut
      Type_Echange
      Ecole_Echange
      No_Sciper
      title
      periode_acad
      periode_pedago
      start_year
      end_year
      Master3
      Master2
      Project_Master
      Master1
    
  
  
    
      3095
      Madame
      Benabdallah Zeineb
      Biocomputing
      NaN
      Présent
      NaN
      NaN
      154573
      Informatique, 2008-2009, Master semestre 2
      2008-2009
      Master semestre 2
      2008
      2009
      False
      True
      False
      False
    
    
      3337
      Monsieur
      Hottelier Thibaud
      Foundations of Software
      NaN
      Présent
      NaN
      NaN
      166229
      Informatique, 2008-2009, Master semestre 3
      2008-2009
      Master semestre 3
      2008
      2009
      True
      False
      False
      False
    
    
      4575
      Monsieur
      Hahling Robin Philippe
      NaN
      NaN
      Présent
      NaN
      NaN
      233184
      Informatique, 2015-2016, Projet Master automne
      2015-2016
      Projet Master automne
      2015
      2016
      False
      False
      True
      False
    
    
      5750
      Monsieur
      Martres Guillaume André Fradji
      NaN
      NaN
      Stage
      NaN
      NaN
      203085
      Informatique, 2013-2014, Master semestre 1
      2013-2014
      Master semestre 1
      2013
      2014
      False
      False
      False
      True
    
    
      2948
      Monsieur
      Popovic Miroslav
      NaN
      NaN
      Présent
      NaN
      NaN
      183055
      Informatique, 2008-2009, Master semestre 1
      2008-2009
      Master semestre 1
      2008
      2009
      False
      False
      False
      True
    
    
      329
      Monsieur
      Önal Cagri
      Computer Engineering - SP
      NaN
      Présent
      NaN
      NaN
      191354
      Informatique, 2009-2010, Master semestre 2
      2009-2010
      Master semestre 2
      2009
      2010
      False
      True
      False
      False
    
    
      4790
      Monsieur
      Hardy Diggory Benjamin
      NaN
      NaN
      Présent
      NaN
      NaN
      210637
      Informatique, 2011-2012, Master semestre 1
      2011-2012
      Master semestre 1
      2011
      2012
      False
      False
      False
      True
    
    
      3124
      Monsieur
      Koller Yannick
      Biocomputing
      NaN
      Présent
      NaN
      NaN
      169734
      Informatique, 2008-2009, Master semestre 2
      2008-2009
      Master semestre 2
      2008
      2009
      False
      True
      False
      False
    
    
      6884
      Monsieur
      Lazarev Nikita
      NaN
      NaN
      Présent
      NaN
      NaN
      268769
      Informatique, 2016-2017, Master semestre 1
      2016-2017
      Master semestre 1
      2016
      2017
      False
      False
      False
      True
    
    
      8869
      Monsieur
      Zheng Da
      NaN
      NaN
      Stage
      NaN
      NaN
      172221
      Informatique, 2007-2008, Master semestre 3
      2007-2008
      Master semestre 3
      2007
      2008
      True
      False
      False
      False

We will remove all students that did certainly not finish the master. That is, we remove students that:

Have not done the Master 1 semestre. (There are not a lot of them (see later) and it is too cumbersome to track them and check if they eventually passed or not)
Did only 1 semester. (They did not pass)
Have a mineur or spec and did less than 3 semesters. (They also did not pass)
Are registered in the current semester (have not passed (yet))

We also remove students that already have been studing in 2007 (the start of our data) because we can not tell how long they already have studied before.



In [16]:

    
# find all students that have not done the 'master 1' semester -> have not studied enough to finish the master.
grouped = master_data.groupby(by='No_Sciper')
no_master_1 = pd.DataFrame(columns=['Civilité', 'Nom_Prénom', 'Orientation_Bachelor', 'Orientation_Master',
      'Spécialisation', 'Filière_opt.', 'Mineur', 'Statut', 'Type_Echange',
      'Ecole_Echange', 'No_Sciper', 'title', 'periode_acad',
      'periode_pedago']) 
for scip, group in grouped:
    if (group.periode_pedago != 'Master semestre 1').all():
        no_master_1 = pd.concat([no_master_1, group])
len(no_master_1.No_Sciper.unique())









    Out[16]:





76



In [17]:

    
# all the students that already studied in the '2007-2008' year:
df_2006 = no_master_1[no_master_1.periode_acad == '2007-2008']
len(df_2006.No_Sciper.unique())









    Out[17]:





66

remove the found students. The others will be removed later.



In [18]:

    
sciper_to_remove = no_master_1['No_Sciper'].unique()
master_data = master_data[~master_data['No_Sciper'].isin(sciper_to_remove)]

Find all the students that are registered in the current semester and filter them.



In [19]:

    
study_now = pd.DataFrame(columns=['Civilité', 'Nom_Prénom', 'Orientation_Bachelor', 'Orientation_Master',
      'Spécialisation', 'Filière_opt.', 'Mineur', 'Statut', 'Type_Echange',
      'Ecole_Echange', 'No_Sciper', 'title', 'periode_acad',
      'periode_pedago']) 
for scip, group in grouped:
    if (group.periode_acad == '2016-2017').any():
        study_now = pd.concat([study_now, group])
len(study_now.No_Sciper.unique())









    Out[19]:





315

... and remove them to



In [20]:

    
sciper_to_remove = study_now['No_Sciper'].unique()
master_data = master_data[~master_data['No_Sciper'].isin(sciper_to_remove)]

Groupby and aggregate by student



In [21]:

    
def group_master_data(grouped_entries):
    # check that there are no two students with the same sciper number and different names or sex.
    must_be_unique_list = ['Sex', 'Name'] #No_Scyper also but we group by it -> unique by construction
    for unique_col in must_be_unique_list:
        if(len(grouped_entries[unique_col].unique()) > 1): raise ValueError('Two students of different '+unique_col+' with same No_Sciper') 
    
    #aggregate the cols
    first_entry = grouped_entries.head(1)
    df_map = {
        'No_Sciper' : first_entry['No_Sciper'].values[0],
        'Name' : first_entry['Name'].values[0],
        'Sex' : first_entry['Sex'].values[0],
        'Specialisation' : grouped_entries['Specialisation'].dropna().unique(), # all the spcs the student was inscribed to
        'Mineur' : grouped_entries['Mineur'].dropna().unique(),# all the minors the student was inscribed to
        'first_year': grouped_entries['start_year'].min(), # smallest start year entry
        'last_year' : grouped_entries['end_year'].max(), # highest year a studend appears
        #some brainfuck lines ;)
        'first_semestre' : grouped_entries.sort_values(by=['start_year', 'periode_pedago'], axis=0, ascending=True)['periode_pedago'].values[0], # eg master1
        'last_semestre' : grouped_entries.sort_values(by=['end_year', 'periode_pedago'], axis=0, ascending=False)['periode_pedago'].values[0], # the name of the last semester (eg. master 3)
        'semesters_done' : grouped_entries.sort_values(by=['end_year'])['periode_pedago'].values,
        'nombre_semestres' : len(grouped_entries), # how many different semesters the student did at epfl
        'project_master' : grouped_entries['Project_Master'].sum() > 0 # if student did the master project
    } 
    # if the student did any minor or spec
    df_map['mineur_or_spe'] = len(df_map['Specialisation']) + len(df_map['Mineur']) > 0
    # True if the student has not (yet) finished the master
    df_map['to_remove'] = df_map['nombre_semestres'] <= 1 or (not df_map['mineur_or_spe'] and df_map['nombre_semestres'] <= 2)
    
    # if there are two spe, take the latest one
    if len(df_map['Specialisation']) > 1:
        df_map['Specialisation'] = grouped_entries[grouped_entries['end_year'] == df_map['last_year']]['Specialisation'].values[0]
    elif len(df_map['Specialisation']) == 1:
        df_map['Specialisation'] = df_map['Specialisation'][0] # take the spe out of the array
        
    # set correct NaNs
    if len(df_map['Mineur']) == 0:
        df_map['Mineur'] = np.nan
    if len(df_map['Specialisation']) == 0:
        df_map['Specialisation'] = np.nan
    
    # make Dataframe
    for (k, v) in df_map.items():
        df_map[k] = [v]
    return pd.DataFrame.from_dict(df_map)
    
    
grouped_master = master_data.groupby(by='No_Sciper', as_index=False, sort=True).apply(group_master_data)
grouped_master.head()









    Out[21]:






  
    
      
      
      Mineur
      Name
      No_Sciper
      Sex
      Specialisation
      first_semestre
      first_year
      last_semestre
      last_year
      mineur_or_spe
      nombre_semestres
      project_master
      semesters_done
      to_remove
    
  
  
    
      0
      0
      NaN
      Droz-dit-Busset Arnault
      138088
      Monsieur
      NaN
      Master semestre 1
      2007
      Master semestre 1
      2008
      False
      1
      False
      [Master semestre 1]
      True
    
    
      1
      0
      NaN
      Cardinaux Damien
      146330
      Monsieur
      NaN
      Master semestre 1
      2007
      Master semestre 3
      2009
      False
      3
      False
      [Master semestre 1, Master semestre 2, Master ...
      False
    
    
      2
      0
      NaN
      Marx Clément
      146742
      Monsieur
      Signals, Images and Interfaces
      Master semestre 1
      2008
      Master semestre 3
      2013
      True
      7
      True
      [Master semestre 1, Master semestre 2, Master ...
      False
    
    
      3
      0
      NaN
      Junod Antoine
      146929
      Monsieur
      NaN
      Master semestre 1
      2007
      Master semestre 2
      2008
      False
      2
      False
      [Master semestre 1, Master semestre 2]
      True
    
    
      4
      0
      NaN
      Good Xavier
      147008
      Monsieur
      Signals, Images and Interfaces
      Master semestre 1
      2011
      Master semestre 3
      2013
      True
      4
      False
      [Master semestre 1, Master semestre 2, Master ...
      False

remove the students that have not finished their master (described above)



In [22]:

    
students_to_remove = grouped_master[grouped_master['to_remove']]
len(students_to_remove)









    Out[22]:





127



In [23]:

    
grouped_master = grouped_master[~grouped_master['to_remove']]

Set the Sciper number as index



In [24]:

    
grouped_master.set_index('No_Sciper', inplace=True)
len(grouped_master)









    Out[24]:





461

How many month did it take each student

First without any further cleaning and thinking



In [25]:

    
master_epfl = grouped_master.copy()



In [26]:

    
plt.hist(master_epfl['nombre_semestres'], bins=8, range=[1, 9])









    Out[26]:





(array([   0.,    4.,  252.,  142.,   48.,   12.,    2.,    1.]),
 array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.]),
 <a list of 8 Patch objects>)



In [27]:

    
master_epfl.describe()









    Out[27]:






  
    
      
      first_year
      last_year
      nombre_semestres
    
  
  
    
      count
      461.000000
      461.000000
      461.000000
    
    
      mean
      2010.644252
      2012.800434
      3.613883
    
    
      std
      2.073464
      2.083913
      0.846054
    
    
      min
      2007.000000
      2009.000000
      2.000000
    
    
      25%
      2009.000000
      2011.000000
      3.000000
    
    
      50%
      2011.000000
      2013.000000
      3.000000
    
    
      75%
      2012.000000
      2015.000000
      4.000000
    
    
      max
      2014.000000
      2016.000000
      8.000000

The mean of the number of semesters is around 3.6 semesters. However not all students are registered in the the Master Project (missing data?), but in the plot above we assume everyone finished their master. So we add to all students that are not inscribed in a Master Project one semester:



In [28]:

    
def add_semester_if_no_PM(row):
    t = row.nombre_semestres
    if(not row.project_master):
        return t + 1
    else:
        return t
        
master_added_sem = master_epfl.copy()
master_added_sem['nombre_semestres'] = master_epfl.apply(add_semester_if_no_PM, axis=1)



In [29]:

    
master_added_sem.describe()









    Out[29]:






  
    
      
      first_year
      last_year
      nombre_semestres
    
  
  
    
      count
      461.000000
      461.000000
      461.000000
    
    
      mean
      2010.644252
      2012.800434
      4.492408
    
    
      std
      2.073464
      2.083913
      0.843419
    
    
      min
      2007.000000
      2009.000000
      3.000000
    
    
      25%
      2009.000000
      2011.000000
      4.000000
    
    
      50%
      2011.000000
      2013.000000
      4.000000
    
    
      75%
      2012.000000
      2015.000000
      5.000000
    
    
      max
      2014.000000
      2016.000000
      8.000000

It made a big difference. But that seems reasonable since most students do a Mineur/Spe which makes already (at least) 4 semesters (MP included)



In [30]:

    
plt.hist(master_added_sem['nombre_semestres'], bins=8, range=[1, 9])









    Out[30]:





(array([   0.,    0.,   19.,  267.,  118.,   45.,   10.,    2.]),
 array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.]),
 <a list of 8 Patch objects>)

And per specialisation

We will take the 'added_semester' data since we think it is more accurate. Note that the average number of semesters is only 0.3 higher than the one of all students.



In [31]:

    
master_with_spe = master_added_sem[~pd.isnull(master_added_sem['Specialisation'])]
master_with_spe.describe()









    Out[31]:






  
    
      
      first_year
      last_year
      nombre_semestres
    
  
  
    
      count
      174.000000
      174.000000
      174.000000
    
    
      mean
      2010.022989
      2012.241379
      4.729885
    
    
      std
      2.070866
      2.147538
      0.900561
    
    
      min
      2007.000000
      2009.000000
      3.000000
    
    
      25%
      2008.000000
      2010.000000
      4.000000
    
    
      50%
      2010.000000
      2012.000000
      5.000000
    
    
      75%
      2012.000000
      2014.000000
      5.000000
    
    
      max
      2014.000000
      2016.000000
      7.000000

almost 50% of all students do a minor or a specialisation:



In [32]:

    
# how many percent of students do a specialisation
100/len(master_added_sem) * len(master_with_spe)









    Out[32]:





37.74403470715835



In [33]:

    
# how many percent of students do a minor
100/len(master_added_sem) * len(master_added_sem[~pd.isnull(master_added_sem['Mineur'])])









    Out[33]:





12.79826464208243

What are the different specialisations and how popular are they?



In [34]:

    
# The different specialisations
master_with_spe['Specialisation'].unique()









    Out[34]:





array(['Signals, Images and Interfaces', 'Internet computing',
       'Biocomputing', 'Foundations of Software',
       'Computer Engineering - SP', 'Service science', 'Software Systems'], dtype=object)

A nice plot showing that there is a huge difference in the number of people taking the different Specialisations



In [35]:

    
sns.countplot(y="Specialisation", data=master_with_spe);

In the time taken to colmplete their studies they don't differ a lot. (The red dotted line is the averarge of all students, the blue line the average of the students with specialisations, the black bar is the confidence interval)



In [36]:

    
ax = sns.barplot(x='nombre_semestres', y='Specialisation', data=master_with_spe);
plt.axvline(master_with_spe['nombre_semestres'].mean(), color='b', linestyle='dashed', linewidth=2)
plt.axvline(master_added_sem['nombre_semestres'].mean(), color='r', linestyle='dashed', linewidth=2)









    Out[36]:





<matplotlib.lines.Line2D at 0x7fbdc846a198>

Compared to all master students, it seems that only the first two specs actually inply a longer stay at EPFL (their confidence interval just barely don't touch the general mean)



In [37]:

    
def calc_diff_from_average(col, average):
    return (lambda row: row[col] - average)
master_with_spe['diff_from_average'] = master_with_spe.apply(calc_diff_from_average('nombre_semestres', master_added_sem['nombre_semestres'].mean()), axis=1)



In [38]:

    
sns.barplot(x='diff_from_average', y='Specialisation', data=master_with_spe);

Test if the difference in average is statistically significant for each spec



In [39]:

    
import scipy.stats as stats

We want to see if the difference of the average number of semesters for a particular specialisation and for all the master students in informatique are statistically significant with a threshold of 95%

We use a Welch's T-Test (which does not assume equal population variance): it measures whether the average value differs significantly across samples.



In [40]:

    
specialisation = master_with_spe.Specialisation.unique()

for spec in specialisation:
    print(spec)
    data_spec = master_with_spe[master_with_spe.Specialisation == spec]
    print (stats.ttest_ind(a = data_spec.nombre_semestres, b= master_added_sem.nombre_semestres, equal_var=False))
    print('\n')









    



Signals, Images and Interfaces
Ttest_indResult(statistic=2.1283292284075954, pvalue=0.044125577358993986)


Internet computing
Ttest_indResult(statistic=2.220772029835635, pvalue=0.028748167488286149)


Biocomputing
Ttest_indResult(statistic=0.87714755819307277, pvalue=0.44443863031319558)


Foundations of Software
Ttest_indResult(statistic=1.7541617707686779, pvalue=0.084619810630796433)


Computer Engineering - SP
Ttest_indResult(statistic=-1.0787477274177004, pvalue=0.3006969215079382)


Service science
Ttest_indResult(statistic=-0.47394379657037139, pvalue=0.6811170614366413)


Software Systems
Ttest_indResult(statistic=0.039333365228377604, pvalue=0.9696357967944409)

Only Signals, Images and Interfaces and Internet computing have a pvalue < 0.05 (pvalue = 0.044 and 0.028 resp.) thus we can reject the null hypothesis and tell that the difference is significant.

For all other specialisation we cannot reject the null hypothesis of identical average scores (pvalue > 0.05), which means: for those specialisation we cannot say that the difference is in average statistically significant

Female vs Male (Optional)

Except for the number of students, male and females are quite similar



In [41]:

    
Female = master_added_sem[master_added_sem.Sex == 'Madame']
Female.describe()









    Out[41]:






  
    
      
      first_year
      last_year
      nombre_semestres
    
  
  
    
      count
      64.000000
      64.000000
      64.000000
    
    
      mean
      2010.828125
      2012.968750
      4.484375
    
    
      std
      1.830558
      1.868441
      0.959368
    
    
      min
      2007.000000
      2009.000000
      3.000000
    
    
      25%
      2010.000000
      2012.000000
      4.000000
    
    
      50%
      2011.000000
      2013.000000
      4.000000
    
    
      75%
      2012.000000
      2014.250000
      5.000000
    
    
      max
      2014.000000
      2016.000000
      8.000000



In [42]:

    
Male = master_added_sem[master_added_sem.Sex == 'Monsieur']
Male.describe()









    Out[42]:






  
    
      
      first_year
      last_year
      nombre_semestres
    
  
  
    
      count
      397.000000
      397.000000
      397.000000
    
    
      mean
      2010.614610
      2012.773300
      4.493703
    
    
      std
      2.110601
      2.117506
      0.824551
    
    
      min
      2007.000000
      2009.000000
      3.000000
    
    
      25%
      2009.000000
      2011.000000
      4.000000
    
    
      50%
      2011.000000
      2013.000000
      4.000000
    
    
      75%
      2012.000000
      2015.000000
      5.000000
    
    
      max
      2014.000000
      2016.000000
      8.000000

We want to see if the difference of the average years for female and male are statistically significant with a threshold of 95%

We use a Welch's T-Test (which does not assume equal population variance): it measures whether the average value differs significantly across samples.



In [43]:

    
stats.ttest_ind(a = Female.nombre_semestres, b= Male.nombre_semestres, equal_var=False)









    Out[43]:





Ttest_indResult(statistic=-0.073527746097899985, pvalue=0.94157267816404788)

Since the pvalue is > 0.05, we cannot reject the null hypothesis of identical average scores which means: we cannot say that the difference is in average statistically significant, in fact it is quite similar (p value close to 1)

Also over time nothing much changed. The females might have a small upwards trend, but the confidence intervall is quite big.



In [44]:

    
sns.regplot(x=Male.last_year, y=Male.nombre_semestres, marker="o",ci=95, label='Male', scatter_kws={'s':100})
ax =sns.regplot(x=Female.last_year, y=Female.nombre_semestres, marker="d", ci= 95, label='Female', color='r')
ax.legend(loc="best")









    Out[44]:





<matplotlib.legend.Legend at 0x7fbdc83b4048>



In [ ]:

	Civilité	Nom_Prénom	Orientation_Master	Spécialisation	Filière_opt.	Mineur	Statut	Type_Echange	Ecole_Echange	No_Sciper	title	periode_acad	periode_pedago
7567	Monsieur	Martin Damien	NaN	NaN	NaN	NaN	Présent	NaN	NaN	121367	Passerelle HES - IN, 2016-2017, Semestre automne	2016-2017	Semestre automne
8828	Monsieur	Gulati Asheesh	NaN	Internet computing	NaN	NaN	Présent	NaN	NaN	128911	Informatique, 2007-2008, Master semestre 3	2007-2008	Master semestre 3
8602	Monsieur	Gulati Asheesh	NaN	Internet computing	NaN	NaN	Présent	NaN	NaN	128911	Informatique, 2007-2008, Master semestre 2	2007-2008	Master semestre 2
8870	Monsieur	Zhou Maoan	NaN	NaN	NaN	NaN	Stage	NaN	NaN	129093	Informatique, 2007-2008, Master semestre 3	2007-2008	Master semestre 3
8654	Monsieur	Zhou Maoan	NaN	NaN	NaN	NaN	Présent	NaN	NaN	129093	Informatique, 2007-2008, Master semestre 2	2007-2008	Master semestre 2
7563	Monsieur	Essellak Radouane	NaN	NaN	NaN	NaN	Présent	NaN	NaN	129094	Passerelle HES - IN, 2016-2017, Semestre automne	2016-2017	Semestre automne
8848	Monsieur	Ni Zhong Zhong	NaN	NaN	NaN	NaN	Présent	NaN	NaN	129326	Informatique, 2007-2008, Master semestre 3	2007-2008	Master semestre 3
8631	Monsieur	Ni Zhong Zhong	NaN	NaN	NaN	NaN	Présent	NaN	NaN	129326	Informatique, 2007-2008, Master semestre 2	2007-2008	Master semestre 2
8407	Monsieur	Droz-dit-Busset Arnault	NaN	NaN	NaN	NaN	Présent	NaN	NaN	138088	Informatique, 2007-2008, Master semestre 1	2007-2008	Master semestre 1
8813	Monsieur	Clivaz Jean-Philippe	NaN	NaN	NaN	NaN	Présent	NaN	NaN	145546	Informatique, 2007-2008, Master semestre 3	2007-2008	Master semestre 3

	Sex	Name	Specialisation	Mineur	Statut	Type_Echange	Ecole_Echange	No_Sciper	title	periode_acad	periode_pedago	start_year	end_year	Master3	Master2	Project_Master	Master1
3095	Madame	Benabdallah Zeineb	Biocomputing	NaN	Présent	NaN	NaN	154573	Informatique, 2008-2009, Master semestre 2	2008-2009	Master semestre 2	2008	2009	False	True	False	False
3337	Monsieur	Hottelier Thibaud	Foundations of Software	NaN	Présent	NaN	NaN	166229	Informatique, 2008-2009, Master semestre 3	2008-2009	Master semestre 3	2008	2009	True	False	False	False
4575	Monsieur	Hahling Robin Philippe	NaN	NaN	Présent	NaN	NaN	233184	Informatique, 2015-2016, Projet Master automne	2015-2016	Projet Master automne	2015	2016	False	False	True	False
5750	Monsieur	Martres Guillaume André Fradji	NaN	NaN	Stage	NaN	NaN	203085	Informatique, 2013-2014, Master semestre 1	2013-2014	Master semestre 1	2013	2014	False	False	False	True
2948	Monsieur	Popovic Miroslav	NaN	NaN	Présent	NaN	NaN	183055	Informatique, 2008-2009, Master semestre 1	2008-2009	Master semestre 1	2008	2009	False	False	False	True
329	Monsieur	Önal Cagri	Computer Engineering - SP	NaN	Présent	NaN	NaN	191354	Informatique, 2009-2010, Master semestre 2	2009-2010	Master semestre 2	2009	2010	False	True	False	False
4790	Monsieur	Hardy Diggory Benjamin	NaN	NaN	Présent	NaN	NaN	210637	Informatique, 2011-2012, Master semestre 1	2011-2012	Master semestre 1	2011	2012	False	False	False	True
3124	Monsieur	Koller Yannick	Biocomputing	NaN	Présent	NaN	NaN	169734	Informatique, 2008-2009, Master semestre 2	2008-2009	Master semestre 2	2008	2009	False	True	False	False
6884	Monsieur	Lazarev Nikita	NaN	NaN	Présent	NaN	NaN	268769	Informatique, 2016-2017, Master semestre 1	2016-2017	Master semestre 1	2016	2017	False	False	False	True
8869	Monsieur	Zheng Da	NaN	NaN	Stage	NaN	NaN	172221	Informatique, 2007-2008, Master semestre 3	2007-2008	Master semestre 3	2007	2008	True	False	False	False

	first_year	last_year	nombre_semestres
count	461.000000	461.000000	461.000000
mean	2010.644252	2012.800434	3.613883
std	2.073464	2.083913	0.846054
min	2007.000000	2009.000000	2.000000
25%	2009.000000	2011.000000	3.000000
50%	2011.000000	2013.000000	3.000000
75%	2012.000000	2015.000000	4.000000
max	2014.000000	2016.000000	8.000000

	first_year	last_year	nombre_semestres
count	174.000000	174.000000	174.000000
mean	2010.022989	2012.241379	4.729885
std	2.070866	2.147538	0.900561
min	2007.000000	2009.000000	3.000000
25%	2008.000000	2010.000000	4.000000
50%	2010.000000	2012.000000	5.000000
75%	2012.000000	2014.000000	5.000000
max	2014.000000	2016.000000	7.000000

	first_year	last_year	nombre_semestres
count	64.000000	64.000000	64.000000
mean	2010.828125	2012.968750	4.484375
std	1.830558	1.868441	0.959368
min	2007.000000	2009.000000	3.000000
25%	2010.000000	2012.000000	4.000000
50%	2011.000000	2013.000000	4.000000
75%	2012.000000	2014.250000	5.000000
max	2014.000000	2016.000000	8.000000

	first_year	last_year	nombre_semestres
count	397.000000	397.000000	397.000000
mean	2010.614610	2012.773300	4.493703
std	2.110601	2.117506	0.824551
min	2007.000000	2009.000000	3.000000
25%	2009.000000	2011.000000	4.000000
50%	2011.000000	2013.000000	4.000000
75%	2012.000000	2015.000000	5.000000
max	2014.000000	2016.000000	8.000000