Solución del proyecto de comentarios de Facebook

(Clasificador)

Es importante que tengan instaladas todas las bibliotecas que se importan a continuación, incluyendo ipywidgets para mejor visualización de este notebook. Además, la versión de Jupyter debe ser la más reciente.

Todo se puede instalar mediante pip.


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from ipywidgets import interact, interact_manual
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import regression
from sklearn.pipeline import Pipeline
from pprint import pprint

Carga de los datos

Parámetros de la función load_data:

  • train_f: rutas hacia los datos de entrenamiento en .csv
  • test_f: rutas hacia los datos de evaluación en .csv
  • join: booleano que indica si se deben cargar TODOS los datos de train_f y test_f y concatenarlos en un solo pd.DataFrame

load_data construye dos objetos de tipo pd.DataFrame que guardan los datos cargados.


In [2]:
training = []
testing = []

train_dir = 'data/facebook-data/Training/'
test_dir = 'data/facebook-data/Testing/TestSet/'
train_files = [os.path.join(train_dir, f) 
               for f in os.listdir(train_dir)
               if f.endswith('.csv')]
test_files = [os.path.join(test_dir, f) 
              for f in os.listdir(test_dir)
              if f.endswith('.csv')]

@interact(train_f=train_files, test_f=test_files, join=False)
def load_data(train_f, test_f, join=False):
    global training, testing, train_files, test_files
    if not join:
        training = pd.read_csv(train_f, header=None)
        testing = pd.read_csv(test_f, header=None)
    else:
        list1 = [pd.read_csv(tr, header=None) for tr in train_files]
        list2 = [pd.read_csv(te, header=None) for te in test_files]
        training = pd.concat(list1)
        testing = pd.concat(list2)
    print(training.shape, testing.shape)



In [19]:
training


Out[19]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
0 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
1 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
2 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
3 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 1 0 0 0 0 0 0
4 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 0
5 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
6 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
7 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
8 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
9 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 1 0 0 0 0 0 0 0
10 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 1 0 0 0 0 0 0 0
11 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 1 0 0 0
12 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 1 0 0 0 0 3
13 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
14 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 1 0 0 0 0 0
15 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 1 0 0 0
16 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
17 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
18 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
19 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
20 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 1 0 0 0
21 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 5
22 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
23 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
24 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 1 0 0 0 0 0 0 0
25 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 1 0 0 0 0 0 0 0
26 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 2
27 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 0
28 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 0
29 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
40919 309914 13 5432 26 0.0 375.0 8.594262 4.0 25.369146 0.0 ... 0 0 0 0 0 0 1 0 0 0
40920 309914 13 5432 26 0.0 375.0 8.594262 4.0 25.369146 0.0 ... 0 0 0 0 0 0 0 1 0 0
40921 309914 13 5432 26 0.0 375.0 8.594262 4.0 25.369146 0.0 ... 0 0 0 0 0 1 0 0 0 2
40922 309914 13 5432 26 0.0 375.0 8.594262 4.0 25.369146 0.0 ... 0 0 0 0 0 0 1 0 0 0
40923 309914 13 5432 26 0.0 375.0 8.594262 4.0 25.369146 0.0 ... 0 0 0 0 0 0 0 1 0 0
40924 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 1 0 0 0 0 0 0 1 0 79
40925 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 1 0 1 0 0 0 0 0 0 7
40926 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 1 1 0 0 0 0 0 0 9
40927 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 1 1 0 0 0 0 0 0 17
40928 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 1 1 0 0 0 0 0 0 67
40929 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 1 1 0 0 0 0 0 0 19
40930 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 1 0 0 0 0 0 0 128
40931 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 1 0 0 0 0 0 298
40932 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 1 0 0 0 0 1
40933 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 1 0 0 0 0 0 41
40934 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 1 0 0 0 0 5
40935 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 1 0 0 4
40936 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 1 0 0 0 0 0 211
40937 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 1 0 0 0 0 12
40938 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 1 0 0 0
40939 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 1 0 0 0 0 90
40940 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 1 0 0 6
40941 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 1 0 0 0 5
40942 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 1 0 0 0 0 254
40943 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 1 0 0 0 7
40944 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 0 1 0 1
40945 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 0 1 0 2
40946 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 1 0 0 72
40947 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 0 1 0 28
40948 7170111 70 497000 9 0.0 1881.0 497.200000 269.0 502.318385 0.0 ... 0 0 0 0 0 0 0 1 0 11

40949 rows × 54 columns


In [20]:
# Define needed np arrays
X_train = training.get_values()[:, :-1]
Y_train = training.get_values()[:, -1:]
X_test = testing.get_values()[:, :-1]
Y_test = testing.get_values()[:, -1:]

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape


Out[20]:
((40949, 53), (40949, 1), (100, 53), (100, 1))

In [21]:
# One-hot encode feature #3
encoder = OneHotEncoder(categorical_features=[3])
encoder.fit(X_train)
X_train = encoder.transform(X_train).toarray()
X_test = encoder.transform(X_test).toarray()
X_train.shape, X_test.shape, encoder.feature_indices_, encoder.active_features_


Out[21]:
((40949, 133),
 (100, 133),
 array([  0, 107]),
 array([  1,   2,   3,   4,   5,   6,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  42,
         44,  45,  46,  47,  49,  50,  51,  54,  55,  56,  57,  58,  59,
         60,  61,  62,  63,  66,  67,  68,  72,  73,  75,  76,  77,  79,
         80,  81,  82,  83,  85,  87,  89,  90,  91,  92,  93,  96, 100,
        101, 105, 106]))

Regresión Lineal


In [6]:
linear_reg = LinearRegression(normalize=True)
linear_reg.fit(X_train, Y_train)


/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)
Out[6]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [7]:
# Testing
Y_pred = linear_reg.predict(X_test)
Y_pred[:15]


Out[7]:
array([[  7.75195312e+00],
       [  1.01093750e+01],
       [  2.99130167e+11],
       [ -7.89062500e-01],
       [ -2.21875000e+00],
       [  1.95105469e+02],
       [  8.81054688e+00],
       [  4.58789062e+00],
       [  2.99130167e+11],
       [ -3.65234375e+00],
       [ -9.53320312e+00],
       [  2.51484375e+01],
       [  1.31230469e+01],
       [  1.40195312e+01],
       [ -2.53125000e+00]])

In [8]:
# Scoring
evs = regression.explained_variance_score(Y_test, Y_pred)
mae = regression.mean_absolute_error(Y_test, Y_pred)
mse = regression.mean_squared_error(Y_test, Y_pred)
mabser = regression.median_absolute_error(Y_test, Y_pred)
r2 = regression.r2_score(Y_test, Y_pred)
pprint({
    'explained_variance_score': evs,
    'mean_absolute_error': mae,
    'mean_squared_error': mse,
    'median_absolute_error': mae,
    'r2_score': r2
})


{'explained_variance_score': -8.2509327038841523e+17,
 'mean_absolute_error': 20939111728.671738,
 'mean_squared_error': 6.2635199852279266e+21,
 'median_absolute_error': 20939111728.671738,
 'r2_score': -8.871970648450281e+17}

Regresión polinómica


In [14]:
reg_dict = {
    LinearRegression.__name__: LinearRegression,
    Ridge.__name__: Ridge,
    Lasso.__name__: Lasso
}

model = None
@interact_manual(degree=(2, 10), regressor=reg_dict)
def poly_reg_pipeline(degree=3, regressor=LinearRegression):
    global model
    poly_features = PolynomialFeatures(degree)
    reg = regressor(normalize=True)
    print('building model...')
    model = Pipeline([('poly_features', poly_features), ('regressor', reg)])
    print('training...')
    model.fit(X_train, Y_train)



In [ ]:
# Testing
Y_pred = model.predict(X_test)
Y_pred[:15]

In [ ]:
# Scoring
evs = regression.explained_variance_score(Y_test, Y_pred)
mae = regression.mean_absolute_error(Y_test, Y_pred)
mse = regression.mean_squared_error(Y_test, Y_pred)
mabser = regression.median_absolute_error(Y_test, Y_pred)
r2 = regression.r2_score(Y_test, Y_pred)
pprint({
    'explained_variance_score': evs,
    'mean_absolute_error': mae,
    'mean_squared_error': mse,
    'median_absolute_error': mae,
    'r2_score': r2
})