
Imports


In [1]:
# data science
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn import linear_model  # or others

# plot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={"figure.figsize": (15, 6)})
sns.set_palette(sns.color_palette("Set2", 10))
#sns.set_style("whitegrid")

# core
import os
import glob
import pickle

# scraping
import requests # as req
from bs4 import BeautifulSoup
import json

# maps
import folium
import branca

# network
import networkx as nx

# ui - flow
from tqdm.notebook import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

Data


In [3]:
ice_cream = pd.read_csv("./data_cluedo/dataset/icecream.csv")

hw01_data_folder = '../01 - Pandas and Data Wrangling/Data'
sl_folder = hw01_data_folder + '/ebola/sl_data'

hw03_data_folder = '../03 - Interactive Viz'
europe_df = pd.read_csv(hw03_data_folder + '/data/european_rate_2016.csv')
europe_topo_path = hw03_data_folder + '/topojson/europe.topojson.json'
with open(europe_topo_path, encoding="UTF-8") as topo_file:
    europe_json_data = json.load(topo_file)

hw04_data_folder = '../04 - Applied ML'
lalonde_data = pd.read_csv(hw04_data_folder + '/lalonde.csv')


Code

basic (index, (i)loc, reset, counts)


In [4]:
df = ice_cream
df.index = ['index ' + str(i) + ' !!' for i in range(12)]
df.index.name = 'index_name'

df.loc['index 10 !!']
df.iloc[0]

df.reset_index()
df['month'].value_counts()

df;

pivot table


In [5]:
df


Out[5]:
             month  ice_cream_sales  temperature  deaths_drowning  humidity
index_name
index 0 !!      12             4.75           40                2        30
index 1 !!      11             4.78           50                3        20
index 2 !!       1             4.82           55                4        70
index 3 !!       2             4.83           58                4        70
index 4 !!       3             4.84           60                5        20
index 5 !!      10             4.88           55                6        30
index 6 !!       5             4.91           68                9        20
index 7 !!       9             4.92           70                9        10
index 8 !!       4             4.93           75                8        50
index 9 !!       7             4.93           80               11        10
index 10 !!      6             4.94           83               12        90
index 11 !!      8             4.95           88               11        50

In [6]:
categories_1 = [cat for _ in range(len(df) // 2) for cat in ['A', 'B']]
categories_2 = ['CAT_0' + str(i//4 + 1) for i in range(len(df))]

df['col_index'] = categories_1
df['row_index'] = categories_2

df.pivot_table(index='row_index', columns='col_index', values=['ice_cream_sales', 'month'], aggfunc=[np.min, np.max])


Out[6]:
                       amin                         amax
            ice_cream_sales   month      ice_cream_sales   month
col_index       A      B      A    B         A      B      A    B
row_index
CAT_01       4.75   4.78      1    2      4.82   4.83     12   11
CAT_02       4.84   4.88      3    9      4.91   4.92      5   10
CAT_03       4.93   4.93      4    7      4.94   4.95      6    8

pickle (save as binaries)


In [7]:
r = requests.get('https://epfl.ch')

In [8]:
dict_to_save = {
    'epfl': r.text,
    'test': 'Hi mate!',
}

In [9]:
with open('test.pickle', 'wb') as pickle_file:
    pickle.dump(dict_to_save, pickle_file)

In [10]:
with open("test.pickle","rb") as pickle_file:
    dict_loaded = pickle.load(pickle_file)

In [11]:
dict_to_save == dict_loaded


Out[11]:
True

HW01

list files


In [12]:
glob.glob('./homeworks/*/*.ipynb')


Out[12]:
[]

read_csv - concat - rename - fill_na - index


In [13]:
sl_df_list = [pd.read_csv(path) for path in glob.glob(sl_folder + '/*.csv')]

sl_df = pd.concat(sl_df_list, axis=0)

sl_df['Country'] = 'SL'

rename_dict = {
    'date': 'Date',
    'variable': 'Info',
    'National': 'Total'
}
sl_df = sl_df.rename(columns=rename_dict)

sl_df.fillna(0, inplace=True)

sl_df.index = [sl_df['Country'], sl_df['Date']]
sl_df.index.rename(['Country', 'Date'], inplace=True)

apply - mask - to_numeric - replace - groupby - agg - plot


In [14]:
cases_info_to_keep = ['Total cases of confirmed', 'Total confirmed cases', 'cum_confirmed']

mask = sl_df['Info'].apply(lambda x: x in cases_info_to_keep)

sl_df['Cases'] = np.where(mask, pd.to_numeric(sl_df['Total'].replace(',|%', '', regex=True)), 0)

sl_cases = sl_df['Cases'].groupby('Date').agg(sum)

titanic - plots - categorical - cut - value_counts


In [15]:
tit = pd.read_excel(hw01_data_folder+'/titanic.xls')

def change_type(df, col, type_):
    df[col] = df[col].astype(type_)
    
# CATEGORICAL
    
change_type(tit, 'pclass', 'category')
change_type(tit, 'embarked', 'category')
change_type(tit, 'sex', 'category')

# CUT

tit['age'] = pd.cut(tit['age'], np.linspace(0, 90, 10, dtype=int), right=False)

# COUNTPLOT

#sns.countplot(x="pclass", data=tit)
#sns.countplot(x='age', data=tit)

HW02

requests.get(url).json()


In [16]:
timeshighereducation_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

ranking_brute = requests.get(timeshighereducation_url).json()['data']

infos_to_keep = ["rank", "name", "location", "stats_number_students", "stats_pc_intl_students", "stats_student_staff_ratio"]
column_names =  ["Rank", "University", "Country", "# Students", "% Int. students", "% Faculty members"]

df_times = pd.DataFrame(ranking_brute[:200], index=range(1, 201), columns=infos_to_keep)
df_times.columns = column_names  # rename to the human-readable labels

soup


In [17]:
# SEE sciper.csv

sciper = 238122
r = requests.get("https://people.epfl.ch/" + str(sciper))
soup = BeautifulSoup(r.text, "html.parser")

soup.h1, \
soup.find("div", {"class": "parents"}).span.string


Out[17]:
(<h1>Grégoire Clément</h1>, 'SC-MA2')

regex - merge


In [18]:
'''

df2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')

'''


Out[18]:
"\n\ndf2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')\n\n"

PCA


In [19]:
from sklearn.decomposition import PCA
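The cell above only imports PCA. A minimal fit/transform sketch on the numeric ice-cream columns (the column choice here is only illustrative):

# project the numeric columns onto 2 principal components
numeric = ice_cream[['ice_cream_sales', 'temperature', 'deaths_drowning', 'humidity']]

pca = PCA(n_components=2)
projected = pca.fit_transform(numeric)  # shape (n_samples, 2)
pca.explained_variance_ratio_  # fraction of variance captured per component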

HW03

colorscale


In [20]:
rate_min = min(europe_df['Rate'])
rate_max = max(europe_df['Rate'])

color_scale = branca.colormap.linear.OrRd.scale(rate_min, rate_max)
color_scale = color_scale.to_step(n=8)
color_scale.caption = 'Unemployment Rate (%)'

style function


In [21]:
def style_function(country):
    rate = europe_df.loc[europe_df['Name'] == country['properties']['NAME']]['Rate'].values
    if len(rate) > 0:
        # country is in the dataframe
        return {
            'fillOpacity': 0.8,
            'weight': 0.5,
            'color': 'black',
            'fillColor': color_scale(rate[0])
        }
    else:
        # country is not in the dataframe, so color it black
        return {
            'fillOpacity': 0.2,
            'weight': 0.2,
            'color': 'black',
            'fillColor': 'black'
        }

folium


In [22]:
europe_unemployment_map = folium.Map(
    location=[53,22],
    tiles='cartodbpositron',
    zoom_start=4)

In [23]:
color_scale.add_to(europe_unemployment_map)


Out[23]:
[color scale legend: 3.0 – 23.6]

In [24]:
g = folium.TopoJson(
    data=europe_json_data,
    object_path='objects.europe',
    style_function=style_function,
).add_to(europe_unemployment_map)

In [25]:
europe_unemployment_map


Out[25]:
[interactive Folium map of Europe, colored by unemployment rate]
HW04

propensity score


In [26]:
lr = LogisticRegression()
# Select the features, i.e. drop the id and treat columns
selected_features = lalonde_data.drop(['id', 'treat'], axis=1)
# Fit the model
lr.fit(selected_features, lalonde_data['treat']);

# Compute the propensity scores
propensity_scores = lr.predict_proba(selected_features)

# Keep only the probability of receiving the treatment and store it in the dataframe
lalonde_data['propensity score'] = [x[1] for x in propensity_scores]

In [27]:
# use a full grid over max_depth and n_estimators parameters
param_grid = {
    #"max_depth": [3, 10, 20, None],
    "max_depth": [3, 10],
    "n_estimators": np.linspace(3, 200, num=5, dtype=int)
}

# run grid search
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
'''
grid_search.fit(X_validation, y_validation);
''';
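Once the commented-out fit has actually been run, the winning configuration can be read back from the same grid_search object (kept as a string here too, since the fit above is not executed):

'''
grid_search.best_params_               # best max_depth / n_estimators combination
grid_search.best_score_                # mean cross-validated score of that combination
best_rf = grid_search.best_estimator_  # refitted model, ready to predict
''';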

regressor


In [28]:
X_train = None
y_train = None

clf = RandomForestClassifier()

'''
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
''';
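For a self-contained version of the same template, a sketch on synthetic data (make_classification only stands in for real features here):

from sklearn.datasets import make_classification

# synthetic stand-in data, purely illustrative
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
clf.score(X_test, y_test)  # mean accuracy on the held-out split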

Cluedo 01

correlation dataframe to correlation pairs


In [29]:
def df_to_corr_pairs(corr, limit, abs_=True):
    """Return the pairs of columns whose (absolute) correlation exceeds `limit`."""
    if abs_:
        corr = np.abs(corr)

    indices = corr.index
    corr_pairs = []

    # walk the upper triangle only (i < j) to skip duplicates and the diagonal
    for i, idx_i in enumerate(indices):
        for j, c in enumerate(corr[idx_i]):
            if c > limit and i < j:
                corr_pairs.append((idx_i, indices[j]))
    return corr_pairs

Plots

line plot


In [30]:
sl_cases.plot(kind='line', title='TITLE', grid=True, legend=True, xticks=range(len(sl_cases)));


bar plot


In [31]:
sns.barplot(x="sex", y="survived", hue="pclass", data=tit);


pie


In [32]:
tit['cabin'].dropna().astype(str).str[0].value_counts().plot(kind='pie')
plt.axis('equal');


KDE


In [33]:
sns.kdeplot(lalonde_data['re78'], data2=lalonde_data['age']);


heatmap


In [34]:
sns.heatmap(df.corr(), square=True);


proportions plot


In [35]:
def display_proportions(data, variables, n_cols=3, titles=None):
    N = len(variables)
    f, axes = plt.subplots(nrows=int(np.ceil(N / n_cols)), ncols=n_cols)
    f.set_figheight(10)
    if titles is None:
        titles = range(1, N+1)
    for idx, axis, var in zip(titles, axes.flatten(), variables):
        sns.barplot(x='treat', y=var, data=data, ax=axis)
        axis.set_xticklabels(["Control Group", "Treatment Group"])
        axis.set_xlabel("")
        axis.set_title(idx)
        axis.set_ylabel("mean of {}".format(var))
        
main_var = ['black', 'hispan', 'age', 'married', 'nodegree', 'educ']
display_proportions(lalonde_data, main_var)


Else


In [36]:
# this could have been done in a simpler way for this homework,
# but it might be useful to have such a powerful function for other uses,
# hence we keep it here so that others can use it too :)

def split(X, y, ratios):
    """
    Split X and y given some ratios
    
    Parameters
    ----------
    X : ndarray
    feature matrix
    
    y : ndarray
    target vector
    
    ratios : list(float)
    ratios describing how to split X and y; the remainder forms the last split

    Returns
    -------
    out : tuple(ndarray)
    Tuple containing first the splits of X, then the splits of y
    """
    assert np.sum(ratios) < 1, "sum of ratios must be strictly less than 1"
    assert len(ratios) >= 1, "at least one ratio is required to split"
    
    def inner_split(X, y, ratios, acc_X, acc_y):
        ratio, *ratios_remaining = ratios
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=ratio)

        if len(ratios_remaining) == 0:
            acc_X.extend([X_train, X_test])
            acc_y.extend([y_train, y_test])
            acc_X.extend(acc_y)
            return tuple(acc_X)
        else:
            acc_X.append(X_train)
            acc_y.append(y_train)
            return inner_split(X_test, y_test, [r/(1.0 - ratio) for r in ratios_remaining], acc_X, acc_y)
    
    return inner_split(X, y, ratios, [], [])
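A quick usage example on tiny synthetic arrays (X_demo and y_demo are made up for illustration): ratios=[0.5, 0.25] yields three X splits of roughly 50/25/25 %, followed by the three matching y splits.

X_demo = np.arange(40).reshape(20, 2)
y_demo = np.arange(20)

# 50 % / 25 % / 25 % -> (X_1, X_2, X_3, y_1, y_2, y_3)
X_1, X_2, X_3, y_1, y_2, y_3 = split(X_demo, y_demo, [0.5, 0.25])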

Unit Test

correlation dataframe to correlation pairs


In [37]:
df_to_corr_pairs(ice_cream.corr(), limit=.9)


Out[37]:
[('ice_cream_sales', 'temperature'),
 ('ice_cream_sales', 'deaths_drowning'),
 ('temperature', 'deaths_drowning')]