Es importante que tengan instaladas todas las bibliotecas que se importan a continuación, incluyendo `ipywidgets`, para una mejor visualización de este notebook. Además, la versión de Jupyter debe ser la más reciente. Todo se puede instalar mediante `pip`.
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from ipywidgets import interact, interact_manual
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import regression
from sklearn.pipeline import Pipeline
from pprint import pprint
Parámetros de la función `load_data`:

- `train_f`: ruta hacia los datos de entrenamiento en .csv
- `test_f`: ruta hacia los datos de evaluación en .csv
- `join`: booleano que indica si se deben cargar TODOS los datos de `train_f` y `test_f` y concatenarlos en un solo `pd.DataFrame`

`load_data` construye dos objetos de tipo `pd.DataFrame` que guardan los datos cargados.
In [2]:
# Globals that load_data (defined below) fills in with DataFrames.
training = []
testing = []

train_dir = 'data/facebook-data/Training/'
test_dir = 'data/facebook-data/Testing/TestSet/'


def _csv_paths(directory):
    # Full paths of every .csv file directly inside `directory`.
    return [os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.endswith('.csv')]


train_files = _csv_paths(train_dir)
test_files = _csv_paths(test_dir)
@interact(train_f=train_files, test_f=test_files, join=False)
def load_data(train_f, test_f, join=False):
    """Load CSV data into the global `training`/`testing` DataFrames.

    train_f: path to one training .csv file.
    test_f: path to one testing .csv file.
    join: when True, ignore `train_f`/`test_f` and instead concatenate
        ALL known training files and ALL known testing files.
    """
    global training, testing
    if join:
        training = pd.concat([pd.read_csv(path, header=None) for path in train_files])
        testing = pd.concat([pd.read_csv(path, header=None) for path in test_files])
    else:
        training = pd.read_csv(train_f, header=None)
        testing = pd.read_csv(test_f, header=None)
    print(training.shape, testing.shape)
In [19]:
# Display the loaded training DataFrame (notebook cell output).
training
Out[19]:
In [20]:
# Define needed np arrays: features are all columns but the last,
# the target is the last column (kept 2-D via the -1: slice).
# NOTE: DataFrame.get_values() was removed in pandas 1.0 -- to_numpy()
# is the supported replacement.
X_train = training.to_numpy()[:, :-1]
Y_train = training.to_numpy()[:, -1:]
X_test = testing.to_numpy()[:, :-1]
Y_test = testing.to_numpy()[:, -1:]
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape
Out[20]:
In [21]:
# One-hot encode feature #3.
# OneHotEncoder's `categorical_features` argument (and the
# `feature_indices_`/`active_features_` attributes) were removed from
# scikit-learn, so encode column 3 on its own and prepend the encoded
# columns -- the same layout the old API produced (encoded first,
# remaining columns after).
cat_cols = [3]
rest_cols = [i for i in range(X_train.shape[1]) if i not in cat_cols]
encoder = OneHotEncoder()
encoder.fit(X_train[:, cat_cols])
X_train = np.hstack([encoder.transform(X_train[:, cat_cols]).toarray(),
                     X_train[:, rest_cols]])
X_test = np.hstack([encoder.transform(X_test[:, cat_cols]).toarray(),
                    X_test[:, rest_cols]])
X_train.shape, X_test.shape, encoder.categories_
Out[21]:
In [6]:
# Fit an ordinary-least-squares baseline.
# The `normalize` argument was removed in scikit-learn 1.2; for
# unregularized LinearRegression it never changed the predictions
# (only the internal coefficient scaling), so dropping it is safe.
linear_reg = LinearRegression()
linear_reg.fit(X_train, Y_train)
Out[6]:
In [7]:
# Testing
# Predict on the held-out test set; the bare slice shows the first
# 15 predictions as the notebook cell output.
Y_pred = linear_reg.predict(X_test)
Y_pred[:15]
Out[7]:
In [8]:
# Scoring
# The private sklearn.metrics.regression module was removed in
# scikit-learn 0.24; import the metric functions directly instead.
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, median_absolute_error,
                             r2_score)

evs = explained_variance_score(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
mabser = median_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
pprint({
    'explained_variance_score': evs,
    'mean_absolute_error': mae,
    'mean_squared_error': mse,
    'median_absolute_error': mabser,  # BUG FIX: originally printed `mae` here
    'r2_score': r2
})
In [14]:
# Map regressor display names to their classes for the interact dropdown.
reg_dict = {
    LinearRegression.__name__: LinearRegression,
    Ridge.__name__: Ridge,
    Lasso.__name__: Lasso
}
model = None


@interact_manual(degree=(2, 10), regressor=reg_dict)
def poly_reg_pipeline(degree=3, regressor=LinearRegression):
    """Build and fit a polynomial-regression pipeline, stored in `model`.

    degree: degree of the PolynomialFeatures expansion (2..10).
    regressor: estimator class -- LinearRegression, Ridge or Lasso.
    """
    global model
    # The estimators' `normalize` argument was removed in scikit-learn 1.2;
    # the recommended migration is an explicit scaling step in the pipeline.
    # NOTE(review): StandardScaler standardizes by mean/std while the old
    # normalize=True used an L2 norm -- equivalent in intent, not bit-for-bit.
    from sklearn.preprocessing import StandardScaler
    poly_features = PolynomialFeatures(degree)
    print('building model...')
    model = Pipeline([('poly_features', poly_features),
                      ('scaler', StandardScaler()),
                      ('regressor', regressor())])
    print('training...')
    model.fit(X_train, Y_train)
In [ ]:
# Testing
# Predict with the interactively built pipeline from the cell above;
# the bare slice shows the first 15 predictions as the cell output.
Y_pred = model.predict(X_test)
Y_pred[:15]
In [ ]:
# Scoring
# The private sklearn.metrics.regression module was removed in
# scikit-learn 0.24; import the metric functions directly instead.
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, median_absolute_error,
                             r2_score)

evs = explained_variance_score(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
mabser = median_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
pprint({
    'explained_variance_score': evs,
    'mean_absolute_error': mae,
    'mean_squared_error': mse,
    'median_absolute_error': mabser,  # BUG FIX: originally printed `mae` here
    'r2_score': r2
})