In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
  • We have loaded the necessary libraries above.
  • Now let's load the data.

In [2]:
df = pd.read_csv("./dataset_Facebook.csv", delimiter = ";")

In [3]:
features = ["Category",
            "Page total likes",
            "Type",
            "Post Month",
            "Post Hour",
            "Post Weekday",
            "Paid"]


df[features].head()


Out[3]:
   Category  Page total likes    Type  Post Month  Post Hour  Post Weekday  Paid
0         2            139441   Photo          12          3             4   0.0
1         2            139441  Status          12         10             3   0.0
2         3            139441   Photo          12          3             3   0.0
3         2            139441   Photo          12         10             2   1.0
4         2            139441   Photo          12          3             2   0.0

In [4]:
outcomes = ["Lifetime Post Total Reach",
            "Lifetime Post Total Impressions",
            "Lifetime Engaged Users",
            "Lifetime Post Consumers",
            "Lifetime Post Consumptions",
            "Lifetime Post Impressions by people who have liked your Page",
            "Lifetime Post reach by people who like your Page",
            "Lifetime People who have liked your Page and engaged with your post",
            "comment",
            "like",
            "share",
            "Total Interactions"]

df[outcomes].head()


Out[4]:
Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393

In [5]:
# encode the string column "Type" as integer category codes
#types = list(set(df["Type"]))
#to_categorical = {types[i]:i for i in range(len(types))}
#df["Type"] = df["Type"].apply(lambda x: to_categorical[x])

df[["Type"]] = df[["Type"]].apply(LabelEncoder().fit_transform)

df.head()


Out[5]:
Page total likes Type Category Post Month Post Weekday Post Hour Paid Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 139441 1 2 12 4 3 0.0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 139441 2 2 12 3 10 0.0 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 139441 1 3 12 3 3 0.0 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 139441 1 2 12 2 10 1.0 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 139441 1 2 12 2 3 0.0 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393
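
If needed, the integer codes assigned by LabelEncoder can be recovered by fitting an encoder on the raw column (a small sketch that re-reads the CSV, since df["Type"] has already been overwritten above):

# read the raw Type column again and inspect the learned class -> code mapping
raw_type = pd.read_csv("./dataset_Facebook.csv", delimiter=";")["Type"].dropna()
le = LabelEncoder().fit(raw_type)
print(dict(zip(le.classes_, le.transform(le.classes_))))
# codes follow the alphabetical order of the labels,
# e.g. Photo -> 1 and Status -> 2 as in the table above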

Now let's prepare the data: clean it up and pick the outcome column we would like to predict.

We can then use the bootstrap to approximate the bias and the variance of a regressor.
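
As a reminder, the expected squared error of a regressor $\hat{y}(x)$ over resampled training sets decomposes as

$$
\mathbb{E}\big[(y - \hat{y}(x))^2\big]
  = \underbrace{\big(y - \mathbb{E}[\hat{y}(x)]\big)^2}_{\text{bias}^2}
  + \underbrace{\mathbb{E}\big[\big(\hat{y}(x) - \mathbb{E}[\hat{y}(x)]\big)^2\big]}_{\text{variance}},
$$

where the expectation is over the resampled training sets and, since the observed $y$ stands in for the true regression function, the irreducible noise is folded into the bias term. The bootstrap loop further below estimates both terms empirically.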


In [6]:
df = df.dropna()

#df = df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]


outcomes_of_interest = ["Lifetime Post Consumers", "like"]

X_df = df[features].copy()
y_df = df[outcomes_of_interest].copy()

# one-hot encode the categorical features
cat_features = ["Category",
                "Type",
                "Paid"]

X_df = pd.get_dummies(X_df, columns = cat_features)

print(X_df.head()[["Category_1", "Category_2", "Category_3"]].to_latex())

# predict the first outcome of interest: "Lifetime Post Consumers"
X = X_df.values
y = y_df.values.T[0]

# min-max scale the outcome to [0, 1]
y = (y - y.min()) / (y.max() - y.min())

# (optional) seaborn plots of the outcomes -- a scatterplot and histogram of
# "Lifetime Post Consumers" and a joint plot of consumers vs. likes -- can be
# generated and saved here


\begin{tabular}{lrrr}
\toprule
{} &  Category\_1 &  Category\_2 &  Category\_3 \\
\midrule
0 &         0.0 &         1.0 &         0.0 \\
1 &         0.0 &         1.0 &         0.0 \\
2 &         0.0 &         0.0 &         1.0 \\
3 &         0.0 &         1.0 &         0.0 \\
4 &         0.0 &         1.0 &         0.0 \\
\bottomrule
\end{tabular}
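
As a quick sanity check on the encoding, the 13 columns of X are the 4 numeric features plus the one-hot dummies (a small sketch; the four Type levels are an assumption consistent with the 13-column shape reported below):

# 4 numeric features + Category (3 levels) + Type (4 levels) + Paid (2 levels) = 13
print(X_df.shape)           # expected (495, 13) after dropna()
print(list(X_df.columns))   # numeric columns first, then the *_<level> dummies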


In [7]:
n_test = 100
n_repeat = 1000

#estimator = DecisionTreeRegressor()
estimator = RandomForestRegressor()
#estimator = BayesianRidge(normalize = True)


# compute predictions over bootstrap resamples
y_predicts = np.zeros((n_repeat, len(X)))

for i in range(n_repeat):

    # draw a bootstrap sample of indices (with replacement)
    sample = np.random.choice(range(len(X)), replace = True, size = len(X))

    # hold the last n_test draws out as the test set, dropping any index
    # that also appears in the training part
    train_ids = sample[:-n_test]
    test_ids  = sample[-n_test:]
    test_ids = np.setdiff1d(test_ids, train_ids)
    if len(test_ids) == 0:
        continue

    X_train, y_train = X[train_ids], y[train_ids]
    X_test, y_test = X[test_ids], y[test_ids]

    estimator.fit(X_train, y_train)
    y_predict = estimator.predict(X_test)
    y_predicts[i, test_ids] = y_predict
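
As an aside, each bootstrap replicate above could equivalently be drawn with sklearn.utils.resample (a small sketch, not part of the original run):

from sklearn.utils import resample

# one replicate: indices sampled with replacement, the last n_test draws
# held out as the (deduplicated) test set, exactly as in the loop above
sample = resample(np.arange(len(X)), replace=True, n_samples=len(X))
train_ids = sample[:-n_test]
test_ids = np.setdiff1d(sample[-n_test:], train_ids)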

In [8]:
# per-sample squared bias and variance of the bootstrap predictions
y_bias = (y - np.mean(y_predicts, axis=0)) ** 2

y_error = ((y - y_predicts) ** 2).mean()
y_var = np.var(y_predicts, axis=0, ddof = 1)


print(np.mean(y_bias) + np.mean(y_var))

clf_type = "Random forest"
print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
      "+ {3:.4f} (var)".format(clf_type, np.mean(y_error),
                               np.mean(y_bias), np.mean(y_var)))

print("{0}: {1:.4f} ((bias^2) + (var)) = {2:.4f} (bias^2) "
      "+ {3:.4f} (var)".format(clf_type, np.mean(y_bias) + np.mean(y_var),
                               np.mean(y_bias), np.mean(y_var)))


0.01071066406
Random forest: 0.0107 (error) = 0.0100 (bias^2) + 0.0008 (var)
Random forest: 0.0107 ((bias^2) + (var)) = 0.0100 (bias^2) + 0.0008 (var)
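
Concretely, with R = n_repeat replicates and $\hat{y}_{r,i}$ the prediction for sample $i$ in replicate $r$ (zero when that sample was not held out in replicate $r$), the per-sample quantities averaged above are

$$
\widehat{\mathrm{bias}}^2_i = \Big(y_i - \frac{1}{R}\sum_{r=1}^{R} \hat{y}_{r,i}\Big)^2,
\qquad
\widehat{\mathrm{var}}_i = \frac{1}{R-1}\sum_{r=1}^{R}\Big(\hat{y}_{r,i} - \frac{1}{R}\sum_{s=1}^{R}\hat{y}_{s,i}\Big)^2 .
$$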

In [9]:
from sklearn.linear_model import SGDRegressor, BayesianRidge

#clf = RandomForestRegressor(n_estimators = 1000, max_depth = 2)
clf = BayesianRidge(normalize = True)

# baseline that always predicts the mean of y
dummy_clf = DummyRegressor()
scores = cross_val_score(clf, X, y, cv=10, scoring = make_scorer(mse))
dummy_scores = cross_val_score(dummy_clf, X, y, cv=10, scoring = make_scorer(mse))

print("MSE: %0.8f (+/- %0.8f)" % (scores.mean(), scores.std()))
print("Dummy MSE: %0.8f (+/- %0.8f)" % (dummy_scores.mean(), dummy_scores.std()))

#print(clf)


MSE: 0.00479575 (+/- 0.00511744)
Dummy MSE: 0.00616020 (+/- 0.00521769)
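
To put the two cross-validated scores in perspective, the relative reduction in MSE over the dummy baseline can be reported directly from the arrays computed above (a small sketch):

# relative MSE reduction of BayesianRidge over the mean-predicting baseline
improvement = 1.0 - scores.mean() / dummy_scores.mean()
print("Relative MSE reduction vs. dummy: {0:.1%}".format(improvement))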

Now let's train the regressor on the whole dataset


In [10]:
#clf = RandomForestRegressor(n_estimators = 500, criterion = "mse")
#clf = DecisionTreeRegressor()
# from sklearn.ensemble import BaggingRegressor
# from sklearn.ensemble import AdaBoostRegressor
# clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 1000)
clf = BayesianRidge()

print(X.shape, y.shape)

# fit on the full dataset and report the (in-sample) training MSE
clf.fit(X, y)

print(mse(y, clf.predict(X)))


(495, 13) (495,)
0.00442525939709
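
Finally, since BayesianRidge is a linear model, its learned weights can be matched back to the feature columns (a small sketch using the standard coef_ attribute; column order follows X_df):

# pair each column name with its learned weight, sorted by absolute magnitude
coef = pd.Series(clf.coef_, index=X_df.columns)
print(coef.reindex(coef.abs().sort_values(ascending=False).index))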