For the full experience, view this notebook on the notebook viewer.
In [1]:
# data science
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn import linear_model # or others
# plot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={"figure.figsize": (15, 6)})
sns.set_palette(sns.color_palette("Set2", 10))
#sns.set_style("whitegrid")
# core
import os
import glob
import pickle
# scraping
import requests # as req
from bs4 import BeautifulSoup
import json
# maps
import folium
import branca
# network
import networkx as nx
# ui - flow
from tqdm import tqdm_notebook as tqdm
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
In [3]:
ice_cream = pd.read_csv("./data_cluedo/dataset/icecream.csv")
hw01_data_folder = '../01 - Pandas and Data Wrangling/Data'
sl_folder = hw01_data_folder + '/ebola/sl_data'
hw03_data_folder = '../03 - Interactive Viz'
europe_df = pd.read_csv(hw03_data_folder + '/data/european_rate_2016.csv')
europe_topo_path = hw03_data_folder + '/topojson/europe.topojson.json'
with open(europe_topo_path, encoding="UTF-8") as topo_file:
    europe_json_data = json.load(topo_file)
hw04_data_folder = '../04 - Applied ML'
lalonde_data = pd.read_csv(hw04_data_folder + '/lalonde.csv')
In [4]:
df = ice_cream
df.index = ['index ' + str(i) + ' !!' for i in range(12)]
df.index.name = 'index_name'
df.loc['index 10 !!']
df.iloc[0]
df.reset_index()
df['month'].value_counts()
df;
In [5]:
df
Out[5]:
In [6]:
categories_1 = [cat for _ in range(len(df) // 2) for cat in ['A', 'B']]
categories_2 = ['CAT_0' + str(i//4 + 1) for i in range(len(df))]
df['col_index'] = categories_1
df['row_index'] = categories_2
df.pivot_table(index='row_index', columns='col_index', values=['ice_cream_sales', 'month'], aggfunc=[np.min, np.max])
Out[6]:
In [7]:
r = requests.get('https://epfl.ch')
In [8]:
dict_to_save = {
    'epfl': r.text,
    'test': 'Hi mate!',
}
In [9]:
with open('test.pickle', 'wb') as pickle_file:
    pickle.dump(dict_to_save, pickle_file)
In [10]:
with open("test.pickle","rb") as pickle_file:
dict_loaded = pickle.load(pickle_file)
In [11]:
dict_to_save == dict_loaded
Out[11]:
In [12]:
glob.glob('./homeworks/*/*.ipynb')
Out[12]:
In [13]:
sl_df_list = [pd.read_csv(path) for path in glob.glob(sl_folder + '/*.csv')]
sl_df = pd.concat(sl_df_list, axis=0)
sl_df['Country'] = 'SL'
rename_dict = {
    'date': 'Date',
    'variable': 'Info',
    'National': 'Total'
}
sl_df = sl_df.rename(columns=rename_dict)
sl_df.fillna(0, inplace=True)
sl_df.index = [sl_df['Country'], sl_df['Date']]
sl_df.index.rename(['Country', 'Date'], inplace=True)
In [14]:
cases_info_to_keep = ['Total cases of confirmed', 'Total confirmed cases', 'cum_confirmed']
mask = sl_df['Info'].apply(lambda x: x in cases_info_to_keep)
sl_df['Cases'] = np.where(mask, pd.to_numeric(sl_df['Total'].replace(',|%', '', regex=True)), 0)
sl_cases = sl_df['Cases'].groupby('Date').agg(sum)
In [15]:
tit = pd.read_excel(hw01_data_folder+'/titanic.xls')
def change_type(df, col, type_):
    df[col] = df[col].astype(type_)
# CATEGORICAL
change_type(tit, 'pclass', 'category')
change_type(tit, 'embarked', 'category')
change_type(tit, 'sex', 'category')
# CUT
tit['age'] = pd.cut(tit['age'], np.linspace(0, 90, 10, dtype=int), right=False)
# COUNTPLOT
#sns.countplot(x="pclass", data=tit)
#sns.countplot(x='age', data=tit)
In [16]:
timeshighereducation_url = "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"
ranking_brute = requests.get(timeshighereducation_url).json()['data']
infos_to_keep = ["rank", "name", "location", "stats_number_students", "stats_pc_intl_students", "stats_student_staff_ratio"]
column_names = ["Rank", "University", "Country", "# Students", "% Int. students", "% Faculty members"]
df_times = pd.DataFrame(ranking_brute[:200], index=range(1, 201), columns=infos_to_keep)
df_times.columns = column_names
In [17]:
# SEE sciper.csv
sciper = 238122
r = requests.get("https://people.epfl.ch/" + str(sciper))
soup = BeautifulSoup(r.text, "html.parser")
soup.h1, \
soup.find("div", {"class", "parents"}).span.string
Out[17]:
In [18]:
'''
df2.reset_index().merge(df1, how='inner', on='University_regex').set_index('University')
'''
Out[18]:
In [19]:
from sklearn.decomposition import PCA
In [20]:
rate_min = min(europe_df['Rate'])
rate_max = max(europe_df['Rate'])
color_scale = branca.colormap.linear.OrRd.scale(rate_min, rate_max)
color_scale = color_scale.to_step(n=8)
color_scale.caption = 'Unemployment Rate (%)'
In [21]:
def style_function(country):
    rate = europe_df.loc[europe_df['Name'] == country['properties']['NAME']]['Rate'].values
    if len(rate) > 0:
        # the country is in the dataframe
        return {
            'fillOpacity': 0.8,
            'weight': 0.5,
            'color': 'black',
            'fillColor': color_scale(rate[0])
        }
    else:
        # the country is not in the dataframe, so we colour it black
        return {
            'fillOpacity': 0.2,
            'weight': 0.2,
            'color': 'black',
            'fillColor': 'black'
        }
In [22]:
europe_unemployment_map = folium.Map(
    location=[53, 22],
    tiles='cartodbpositron',
    zoom_start=4)
In [23]:
color_scale.add_to(europe_unemployment_map)
Out[23]:
In [24]:
g = folium.TopoJson(
    data=europe_json_data,
    object_path='objects.europe',
    style_function=style_function,
).add_to(europe_unemployment_map)
In [25]:
europe_unemployment_map
Out[25]:
In [26]:
lr = LogisticRegression()
# Select the features, i.e. drop the id and treat columns
selectedFeatures = lalonde_data.drop(['id', 'treat'], axis=1)
# Fit the model
lr.fit(selectedFeatures, lalonde_data['treat']);
# Calculate the propensity scores
propensity_scores = lr.predict_proba(selectedFeatures)
# Only keep the probability of receiving the treatment and store it in the dataframe
lalonde_data['propensity score'] = [x[1] for x in propensity_scores]
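As a quick sanity check (a sketch, not part of the original cell), the score distributions of the two groups can be compared; the treatment group should tend towards higher propensity scores:

# compare score distributions between control (treat == 0) and treatment (treat == 1)
lalonde_data.groupby('treat')['propensity score'].describe()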
In [27]:
# use a full grid over max_depth and n_estimators parameters
param_grid = {
    #"max_depth": [3, 10, 20, None],
    "max_depth": [3, 10],
    "n_estimators": np.linspace(3, 200, num=5, dtype=int)
}
# run grid search
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
'''
grid_search.fit(X_validation, y_validation);
''';
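Once a validation split is available, the grid search would be fitted and its best configuration read back roughly as follows (a sketch; X_validation and y_validation are placeholders, not defined in this notebook):

# sketch only: assumes X_validation / y_validation have been prepared beforehand
# grid_search.fit(X_validation, y_validation)
# grid_search.best_params_    # best combination found on the grid
# grid_search.best_score_     # mean cross-validated score of that combination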
In [28]:
X_train = None
y_train = None
clf = RandomForestClassifier()
'''
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
''';
In [29]:
def df_to_corr_pairs(corr, limit, abs_=True):
    # Return the pairs of columns whose (absolute) correlation exceeds limit,
    # keeping each pair only once (upper triangle of the correlation matrix).
    if abs_:
        corr = np.abs(corr)
    indices = corr.index
    corr_pairs = []
    for i, idx_i in enumerate(indices):
        for j, c in enumerate(corr[idx_i]):
            if c > limit and i < j:
                corr_pairs.append((idx_i, indices[j]))
    return corr_pairs
In [30]:
sl_cases.plot(kind='line', title='TITLE', grid=True, legend=True, xticks=range(len(sl_cases)));
In [31]:
sns.barplot(x="sex", y="survived", hue="pclass", data=tit);
In [32]:
tit['cabin'].dropna().astype(str).str[0].value_counts().plot(kind='pie')
plt.axis('equal');
In [33]:
sns.kdeplot(lalonde_data['re78'], data2=lalonde_data['age']);
In [34]:
sns.heatmap(df.corr(), square=True);
In [35]:
def display_proportions(data, variables, n_cols=3, titles=None):
    N = len(variables)
    f, axes = plt.subplots(nrows=int(np.ceil(N / n_cols)), ncols=n_cols)
    f.set_figheight(10)
    if titles is None:
        titles = range(1, N+1)
    for idx, axis, var in zip(titles, axes.flatten(), variables):
        sns.barplot(x='treat', y=var, data=data, ax=axis)
        axis.set_xticklabels(["Control Group", "Treatment Group"])
        axis.set_xlabel("")
        axis.set_title(idx)
        axis.set_ylabel("mean of {}".format(var))
main_var = ['black', 'hispan', 'age', 'married', 'nodegree', 'educ']
display_proportions(lalonde_data, main_var)
In [36]:
# this could have been done in a simpler way for this homework,
# but it might be useful to have such a powerful function for other uses,
# hence we decided to keep it here so that others could use it too :)
def split(X, y, ratios):
    """
    Split X and y given some ratios

    Parameters
    ----------
    X : ndarray
        feature matrix
    y : ndarray
        target vector
    ratios : list(float)
        ratios describing how to split X and y

    Returns
    -------
    out : tuple(ndarray)
        One tuple containing first the splits of X, then the splits of y
    """
    assert np.sum(ratios) < 1, "sum of ratios must be strictly less than 1"
    assert len(ratios) >= 1, "at least one ratio required to split"

    def inner_split(X, y, ratios, acc_X, acc_y):
        ratio, *ratios_remaining = ratios
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=ratio)
        if len(ratios_remaining) == 0:
            acc_X.extend([X_train, X_test])
            acc_y.extend([y_train, y_test])
            acc_X.extend(acc_y)
            return tuple(acc_X)
        else:
            acc_X.append(X_train)
            acc_y.append(y_train)
            return inner_split(X_test, y_test, [r/(1.0 - ratio) for r in ratios_remaining], acc_X, acc_y)

    return inner_split(X, y, ratios, [], [])
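As a usage sketch (the arrays below are made up for illustration), splitting with ratios [0.6, 0.2] gives a 60/20/20 train/validation/test partition, with the X splits returned before the y splits:

X_demo = np.arange(100).reshape(50, 2)  # hypothetical feature matrix
y_demo = np.arange(50)                  # hypothetical target vector
X_tr, X_val, X_te, y_tr, y_val, y_te = split(X_demo, y_demo, ratios=[0.6, 0.2])
[a.shape for a in (X_tr, X_val, X_te)]  # (30, 2), (10, 2), (10, 2)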
In [37]:
df_to_corr_pairs(ice_cream.corr(), limit=.9)
Out[37]: