
Imports


In [1]:
# data science
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn import linear_model  # or others

# plot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={"figure.figsize": (15, 6)})
sns.set_palette(sns.color_palette("Set2", 10))
#sns.set_style("whitegrid")

# core
import os
import glob
import pickle

# scraping
import requests # as req
from bs4 import BeautifulSoup
import json

# maps
import folium
import branca

# network
import networkx as nx

# ui - flow
from tqdm.notebook import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

Data


In [3]:
ice_cream = pd.read_csv("./data_cluedo/dataset/icecream.csv")

hw01_data_folder = '../01 - Pandas and Data Wrangling/Data'
sl_folder = hw01_data_folder + '/ebola/sl_data'

hw03_data_folder = '../03 - Interactive Viz'
europe_df = pd.read_csv(hw03_data_folder + '/data/european_rate_2016.csv')
europe_topo_path = hw03_data_folder + '/topojson/europe.topojson.json'
with open(europe_topo_path, encoding="UTF-8") as topo_file:
    europe_json_data = json.load(topo_file)

hw04_data_folder = '../04 - Applied ML'
lalonde_data = pd.read_csv(hw04_data_folder + '/lalonde.csv')


Code

basic (index, (i)loc, reset, counts)


In [4]:
df = ice_cream
df.index = ['index ' + str(i) + ' !!' for i in range(12)]
df.index.name = 'index_name'

df.loc['index 10 !!']
df.iloc[0]

df.reset_index()
df['month'].value_counts()

df;

pivot table


In [5]:
df


Out[5]:
             month  ice_cream_sales  temperature  deaths_drowning  humidity
index_name
index 0 !!      12             4.75           40                2        30
index 1 !!      11             4.78           50                3        20
index 2 !!       1             4.82           55                4        70
index 3 !!       2             4.83           58                4        70
index 4 !!       3             4.84           60                5        20
index 5 !!      10             4.88           55                6        30
index 6 !!       5             4.91           68                9        20
index 7 !!       9             4.92           70                9        10
index 8 !!       4             4.93           75                8        50
index 9 !!       7             4.93           80               11        10
index 10 !!      6             4.94           83               12        90
index 11 !!      8             4.95           88               11        50

In [6]:
categories_1 = [cat for _ in range(len(df) // 2) for cat in ['A', 'B']]
categories_2 = ['CAT_0' + str(i//4 + 1) for i in range(len(df))]

df['col_index'] = categories_1
df['row_index'] = categories_2

df.pivot_table(index='row_index', columns='col_index', values=['ice_cream_sales', 'month'], aggfunc=[np.min, np.max])


Out[6]:
                       amin                         amax
            ice_cream_sales   month      ice_cream_sales   month
col_index       A      B      A    B         A      B      A    B
row_index
CAT_01       4.75   4.78      1    2      4.82   4.83     12   11
CAT_02       4.84   4.88      3    9      4.91   4.92      5   10
CAT_03       4.93   4.93      4    7      4.94   4.95      6    8

pickle (save as binaries)


In [7]:
r = requests.get('https://epfl.ch')

In [8]:
dict_to_save = {
    'epfl': r.text,
    'test': 'Hi mate!',
}

In [9]:
with open('test.pickle', 'wb') as pickle_file:
    pickle.dump(dict_to_save, pickle_file)

In [10]:
with open("test.pickle","rb") as pickle_file:
    dict_loaded = pickle.load(pickle_file)

In [11]:
dict_to_save == dict_loaded


Out[11]:
True

HW01

list files


In [12]:
glob.glob('./homeworks/*/*.ipynb')


Out[12]:
[]

read_csv - concat - rename - fill_na - index


In [13]:
sl_df_list = [pd.read_csv(path) for path in glob.glob(sl_folder + '/*.csv')]

sl_df = pd.concat(sl_df_list, axis=0)

sl_df['Country'] = 'SL'

rename_dict = {
    'date': 'Date',
    'variable': 'Info',
    'National': 'Total'
}
sl_df = sl_df.rename(columns=rename_dict)

sl_df.fillna(0, inplace=True)

sl_df.index = [sl_df['Country'], sl_df['Date']]
sl_df.index.rename(['Country', 'Date'], inplace=True)

apply - mask - to_numeric - replace - groupby - agg - plot


In [14]:
cases_info_to_keep = ['Total cases of confirmed', 'Total confirmed cases', 'cum_confirmed']

mask = sl_df['Info'].apply(lambda x: x in cases_info_to_keep)

sl_df['Cases'] = np.where(mask, pd.to_numeric(sl_df['Total'].replace(',|%', '', regex=True)), 0)

sl_cases = sl_df['Cases'].groupby('Date').agg(sum)

titanic - plots - categorical - cut - value_counts


In [15]:
tit = pd.read_excel(hw01_data_folder+'/titanic.xls')

def change_type(df, col, type_):
    df[col] = df[col].astype(type_)
    
# CATEGORICAL
    
change_type(tit, 'pclass', 'category')
change_type(tit, 'embarked', 'category')
change_type(tit, 'sex', 'category')

# CUT

tit['age'] = pd.cut(tit['age'], np.linspace(0, 90, 10, dtype=int), right=False)

# COUNTPLOT

#sns.countplot(x="pclass", data=tit)
#sns.countplot(x='age', data=tit)

HW02

requests.get(url).json()


In [16]:
timeshighereducation_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"

ranking_brute = requests.get(timeshighereducation_url).json()['data']

infos_to_keep = ["rank", "name", "location", "stats_number_students", "stats_pc_intl_students", "stats_student_staff_ratio"]
column_names =  ["Rank", "University", "Country", "# Students", "% Int. students", "% Faculty members"]

df_times = pd.DataFrame(ranking_brute[:200], index=range(1, 201), columns=infos_to_keep)
df_times.columns = column_names  # rename to the human-readable labels

soup


In [17]:
# SEE sciper.csv

sciper = 238122
r = requests.get("https://people.epfl.ch/" + str(sciper))
soup = BeautifulSoup(r.text, "html.parser")

soup.h1, \
soup.find("div", {"class": "parents"}).span.string


Out[17]:
(<h1>Grégoire Clément</h1>, 'SC-MA2')

regex - merge


In [18]:
'''

df2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')

'''


Out[18]:
"\n\ndf2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')\n\n"

PCA


In [19]:
from sklearn.decomposition import PCA
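The cell above only imports PCA. A minimal fit/transform sketch on the numeric ice-cream columns (the column choice here is only illustrative):

# project the numeric columns onto 2 principal components
numeric = ice_cream[['ice_cream_sales', 'temperature', 'deaths_drowning', 'humidity']]

pca = PCA(n_components=2)
projected = pca.fit_transform(numeric)  # shape (n_samples, 2)
pca.explained_variance_ratio_  # fraction of variance captured per component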

HW03

colorscale


In [20]:
rate_min = min(europe_df['Rate'])
rate_max = max(europe_df['Rate'])

color_scale = branca.colormap.linear.OrRd.scale(rate_min, rate_max)
color_scale = color_scale.to_step(n=8)
color_scale.caption = 'Unemployment Rate (%)'

style function


In [21]:
def style_function(country):
    rate = europe_df.loc[europe_df['Name'] == country['properties']['NAME']]['Rate'].values
    if len(rate) > 0:
        # country is in the dataframe
        return {
            'fillOpacity': 0.8,
            'weight': 0.5,
            'color': 'black',
            'fillColor': color_scale(rate[0])
        }
    else:
        # country is not in the dataframe, so color it black
        return {
            'fillOpacity': 0.2,
            'weight': 0.2,
            'color': 'black',
            'fillColor': 'black'
        }

folium


In [22]:
europe_unemployment_map = folium.Map(
    location=[53,22],
    tiles='cartodbpositron',
    zoom_start=4)

In [23]:
color_scale.add_to(europe_unemployment_map)


Out[23]:
[color scale legend: 3.0 – 23.6]

In [24]:
g = folium.TopoJson(
    data=europe_json_data,
    object_path='objects.europe',
    style_function=style_function,
).add_to(europe_unemployment_map)

In [25]:
europe_unemployment_map


Out[25]:
[interactive Folium map of Europe, colored by unemployment rate]
HW04

propensity score


In [26]:
lr = LogisticRegression()
# Select the features, i.e. drop the id and treat columns
selected_features = lalonde_data.drop(['id', 'treat'], axis=1)
# Fit the model
lr.fit(selected_features, lalonde_data['treat']);

# Compute the propensity scores
propensity_scores = lr.predict_proba(selected_features)

# Keep only the probability of receiving the treatment and store it in the dataframe
lalonde_data['propensity score'] = [x[1] for x in propensity_scores]

In [27]:
# use a full grid over max_depth and n_estimators parameters
param_grid = {
    #"max_depth": [3, 10, 20, None],
    "max_depth": [3, 10],
    "n_estimators": np.linspace(3, 200, num=5, dtype=int)
}

# run grid search
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
'''
grid_search.fit(X_validation, y_validation);
''';
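Once the commented-out fit has actually been run, the winning configuration can be read back from the same grid_search object (kept as a string here too, since the fit above is not executed):

'''
grid_search.best_params_               # best max_depth / n_estimators combination
grid_search.best_score_                # mean cross-validated score of that combination
best_rf = grid_search.best_estimator_  # refitted model, ready to predict
''';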

regressor


In [28]:
X_train = None
y_train = None

clf = RandomForestClassifier()

'''
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
''';
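For a self-contained version of the same template, a sketch on synthetic data (make_classification only stands in for real features here):

from sklearn.datasets import make_classification

# synthetic stand-in data, purely illustrative
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
clf.score(X_test, y_test)  # mean accuracy on the held-out split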

Cluedo 01

correlation dataframe to correlation pairs


In [29]:
def df_to_corr_pairs(corr, limit, abs_=True):
    """Return the pairs of columns whose (absolute) correlation exceeds `limit`."""
    if abs_:
        corr = np.abs(corr)

    indices = corr.index
    corr_pairs = []

    # walk the upper triangle only (i < j) to skip duplicates and the diagonal
    for i, idx_i in enumerate(indices):
        for j, c in enumerate(corr[idx_i]):
            if c > limit and i < j:
                corr_pairs.append((idx_i, indices[j]))
    return corr_pairs

Plots

line plot


In [30]:
sl_cases.plot(kind='line', title='TITLE', grid=True, legend=True, xticks=range(len(sl_cases)));


bar plot


In [31]:
sns.barplot(x="sex", y="survived", hue="pclass", data=tit);


pie


In [32]:
tit['cabin'].dropna().astype(str).str[0].value_counts().plot(kind='pie')
plt.axis('equal');


KDE


In [33]:
sns.kdeplot(lalonde_data['re78'], data2=lalonde_data['age']);


heatmap


In [34]:
sns.heatmap(df.corr(), square=True);


proportions plot


In [35]:
def display_proportions(data, variables, n_cols=3, titles=None):
    N = len(variables)
    f, axes = plt.subplots(nrows=int(np.ceil(N / n_cols)), ncols=n_cols)
    f.set_figheight(10)
    if titles is None:
        titles = range(1, N+1)
    for idx, axis, var in zip(titles, axes.flatten(), variables):
        sns.barplot(x='treat', y=var, data=data, ax=axis)
        axis.set_xticklabels(["Control Group", "Treatment Group"])
        axis.set_xlabel("")
        axis.set_title(idx)
        axis.set_ylabel("mean of {}".format(var))
        
main_var = ['black', 'hispan', 'age', 'married', 'nodegree', 'educ']
display_proportions(lalonde_data, main_var)


Else


In [36]:
# this could have been done in a simpler way for this homework,
# but it might be useful to have such a powerful function for other uses,
# hence we keep it here so that others can use it too :)

def split(X, y, ratios):
    """
    Split X and y given some ratios
    
    Parameters
    ----------
    X : ndarray
    feature matrix
    
    y : ndarray
    target vector
    
    ratios : list(float)
    ratios describing how to split X and y; the remainder forms the last split

    Returns
    -------
    out : tuple(ndarray)
    Tuple containing first the splits of X, then the splits of y
    """
    assert np.sum(ratios) < 1, "sum of ratios must be strictly less than 1"
    assert len(ratios) >= 1, "at least one ratio is required to split"
    
    def inner_split(X, y, ratios, acc_X, acc_y):
        ratio, *ratios_remaining = ratios
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=ratio)

        if len(ratios_remaining) == 0:
            acc_X.extend([X_train, X_test])
            acc_y.extend([y_train, y_test])
            acc_X.extend(acc_y)
            return tuple(acc_X)
        else:
            acc_X.append(X_train)
            acc_y.append(y_train)
            return inner_split(X_test, y_test, [r/(1.0 - ratio) for r in ratios_remaining], acc_X, acc_y)
    
    return inner_split(X, y, ratios, [], [])
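A quick usage example on tiny synthetic arrays (X_demo and y_demo are made up for illustration): ratios=[0.5, 0.25] yields three X splits of roughly 50/25/25 %, followed by the three matching y splits.

X_demo = np.arange(40).reshape(20, 2)
y_demo = np.arange(20)

# 50 % / 25 % / 25 % -> (X_1, X_2, X_3, y_1, y_2, y_3)
X_1, X_2, X_3, y_1, y_2, y_3 = split(X_demo, y_demo, [0.5, 0.25])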

Unit Test

correlation dataframe to correlation pairs


In [37]:
df_to_corr_pairs(ice_cream.corr(), limit=.9)


Out[37]:
[('ice_cream_sales', 'temperature'),
 ('ice_cream_sales', 'deaths_drowning'),
 ('temperature', 'deaths_drowning')]