In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
In [2]:
df = pd.read_csv("./dataset_Facebook.csv", delimiter = ";")
In [3]:
features = ["Category",
"Page total likes",
"Type",
"Post Month",
"Post Hour",
"Post Weekday",
"Paid"]
df[features].head()
Out[3]:
In [4]:
outcomes = ["Lifetime Post Total Reach",
"Lifetime Post Total Impressions",
"Lifetime Engaged Users",
"Lifetime Post Consumers",
"Lifetime Post Consumptions",
"Lifetime Post Impressions by people who have liked your Page",
"Lifetime Post reach by people who like your Page",
"Lifetime People who have liked your Page and engaged with your post",
"comment",
"like",
"share",
"Total Interactions"]
df[outcomes].head()
Out[4]:
In [5]:
# convert a string variable to a categorical one
#types = list(set(df["Type"]))
#to_categorical = {types[i]:i for i in range(len(types))}
#df["Type"] = df["Type"].apply(lambda x: to_categorical[x])
df[["Type"]] = df[["Type"]].apply(LabelEncoder().fit_transform)
df.head()
Out[5]:
Now let's prepare the data by cleaning it up and choosing the relevant columns we would like to predict.
We can then use the bootstrap to obtain an approximation of the bias and the variance of our estimator.
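As a brief sketch of what the bootstrap loop below approximates (with the average over bootstrap resamples standing in for the expectation over independent training sets), the expected squared error of a regressor's prediction $\hat{y}$ at a point with true value $y$ decomposes as

$$\mathbb{E}\big[(y - \hat{y})^2\big] = \big(y - \mathbb{E}[\hat{y}]\big)^2 + \mathrm{Var}[\hat{y}] + \sigma^2,$$

where the first term is the squared bias, the second the variance, and $\sigma^2$ the irreducible noise. The cells below estimate the first two terms from the out-of-sample predictions collected across repetitions.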
In [6]:
df = df.dropna()
#df = df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
outcomes_of_interest = ["Lifetime Post Consumers", "like"]
X_df = df[features].copy()
y_df = df[outcomes_of_interest].copy()
cat_features = ["Category",
"Type",
"Paid"]
X_df = pd.get_dummies(X_df, columns = cat_features)
print(X_df.head()[["Category_1", "Category_2", "Category_3"]].to_latex())
X = X_df.values
y = y_df.values.T[0]
y = (y-y.min())/(y.max() - y.min())
# import seaborn as sns
# y_df['id'] = range(1, len(df) + 1)
#y_df.head()
# sns_plot = sns.lmplot(x="id", y= attribute, data=y_df, fit_reg=False, aspect = 2)
# sns_plot.savefig("scaterplot_lpc.png",bbox_inches='tight')
# sns_plot.savefig("scaterplot_lpc.pdf",bbox_inches='tight')
# sns_plot = sns.jointplot(x="Lifetime Post Consumers", y="like", data=y_df, ratio = 2)
# sns_plot.savefig("joint_plot.png",bbox_inches='tight')
# sns_plot.savefig("joint_plot.pdf",bbox_inches='tight')
# sns.distplot(y, kde=False, rug=True)
# sns_plot.savefig("histogram_lpc.png",bbox_inches='tight')
# sns_plot.savefig("histogram_lpc.pdf",bbox_inches='tight')
In [7]:
n_test = 100
n_repeat = 1000
#estimator = DecisionTreeRegressor()
estimator = RandomForestRegressor()
#estimator = BayesianRidge(normalize = True)
# Compute predictions
y_predicts = np.zeros((n_repeat, len(X)))
#stdy = y/y.max()
for i in range(n_repeat):
    # draw a bootstrap sample of indices (with replacement)
    sample = np.random.choice(range(len(X)), replace=True, size=len(X))
    train_ids = sample[:-n_test]
    test_ids = sample[-n_test:]
    # keep only test points that did not also appear in the training sample
    test_ids = np.setdiff1d(test_ids, train_ids)
    if len(test_ids) == 0:
        continue
    X_train, y_train = X[train_ids], y[train_ids]
    X_test, y_test = X[test_ids], y[test_ids]
    estimator.fit(X_train, y_train)
    y_predict = estimator.predict(X_test)
    # store this repetition's predictions at the held-out positions
    y_predicts[i, test_ids] = y_predict
In [8]:
y_bias = (y - np.mean(y_predicts, axis=0)) **2
y_error = ((y - y_predicts) **2).mean()
y_var = np.var(y_predicts, axis=0, ddof = 1)
print(np.mean(y_bias) + np.mean(y_var))
clf_type = "Decision tree"
print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
"+ {3:.4f} (var)".format(clf_type,
np.mean(y_error),
np.mean(y_bias),
np.mean(y_var)))
print("{0}: {1:.4f} ((bias^2) + (var)) = {2:.4f} (bias^2) "
"+ {3:.4f} (var)".format(clf_type,
np.mean(y_bias) + np.mean(y_var),
np.mean(y_bias),
np.mean(y_var)))
In [9]:
#clf = RandomForestRegressor(n_estimators=1000, max_depth=2)
from sklearn.linear_model import SGDRegressor, BayesianRidge
clf = BayesianRidge(normalize = True)
dummy_clf = DummyRegressor()
scores = cross_val_score(clf, X, y, cv=10,scoring = make_scorer(mse))
dummy_scores = cross_val_score(dummy_clf, X, y, cv=10, scoring = make_scorer(mse))
print("MSE: %0.8f (+/- %0.8f)" % (scores.mean(), scores.std()))
print("Dummy MSE: %0.8f (+/- %0.8f)" % (dummy_scores.mean(), dummy_scores.std()))
#print(clf)
Now let's train the regressor on the whole dataset; note that the MSE reported below is the training (resubstitution) error rather than an estimate of generalization error.
In [10]:
#clf = RandomForestRegressor(n_estimators = 500, criterion = "mse")
#clf = DecisionTreeRegressor()
# from sklearn.ensemble import BaggingRegressor
# from sklearn.ensemble import AdaBoostRegressor
clf = BayesianRidge()
print(X.shape)
# clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=1000)
print(X.shape, y.shape)
stdy = y
clf.fit(X, stdy)
print(mse(stdy, clf.predict(X)))