In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
  • We have loaded the necessary libraries above.
  • Now let's load the data.

In [2]:
df = pd.read_csv("./dataset_Facebook.csv", delimiter = ";")

In [3]:
features = ["Category",
            "Page total likes",
            "Type",
            "Post Month",
            "Post Hour",
            "Post Weekday",
            "Paid"]


df[features].head()


Out[3]:
   Category  Page total likes    Type  Post Month  Post Hour  Post Weekday  Paid
0         2            139441   Photo          12          3             4   0.0
1         2            139441  Status          12         10             3   0.0
2         3            139441   Photo          12          3             3   0.0
3         2            139441   Photo          12         10             2   1.0
4         2            139441   Photo          12          3             2   0.0

In [4]:
outcomes = ["Lifetime Post Total Reach",
            "Lifetime Post Total Impressions",
            "Lifetime Engaged Users",
            "Lifetime Post Consumers",
            "Lifetime Post Consumptions",
            "Lifetime Post Impressions by people who have liked your Page",
            "Lifetime Post reach by people who like your Page",
            "Lifetime People who have liked your Page and engaged with your post",
            "comment",
            "like",
            "share",
            "Total Interactions"]

df[outcomes].head()


Out[4]:
Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393

In [5]:
# encode the string column "Type" as integer category codes
#types = list(set(df["Type"]))
#to_categorical = {types[i]:i for i in range(len(types))}
#df["Type"] = df["Type"].apply(lambda x: to_categorical[x])

df[["Type"]] = df[["Type"]].apply(LabelEncoder().fit_transform)

df.head()


Out[5]:
Page total likes Type Category Post Month Post Weekday Post Hour Paid Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 139441 1 2 12 4 3 0.0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 139441 2 2 12 3 10 0.0 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 139441 1 3 12 3 3 0.0 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 139441 1 2 12 2 10 1.0 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 139441 1 2 12 2 3 0.0 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393
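
If needed, the integer codes assigned by LabelEncoder can be recovered by fitting an encoder on the raw column (a small sketch that re-reads the CSV, since df["Type"] has already been overwritten above):

# read the raw Type column again and inspect the learned class -> code mapping
raw_type = pd.read_csv("./dataset_Facebook.csv", delimiter=";")["Type"].dropna()
le = LabelEncoder().fit(raw_type)
print(dict(zip(le.classes_, le.transform(le.classes_))))
# codes follow the alphabetical order of the labels,
# e.g. Photo -> 1 and Status -> 2 as in the table above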

Now let's prepare the data: clean it up and pick the outcome column we would like to predict.

We can then use the bootstrap to approximate the bias and the variance of a regressor.
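
As a reminder, the expected squared error of a regressor $\hat{y}(x)$ over resampled training sets decomposes as

$$
\mathbb{E}\big[(y - \hat{y}(x))^2\big]
  = \underbrace{\big(y - \mathbb{E}[\hat{y}(x)]\big)^2}_{\text{bias}^2}
  + \underbrace{\mathbb{E}\big[\big(\hat{y}(x) - \mathbb{E}[\hat{y}(x)]\big)^2\big]}_{\text{variance}},
$$

where the expectation is over the resampled training sets and, since the observed $y$ stands in for the true regression function, the irreducible noise is folded into the bias term. The bootstrap loop further below estimates both terms empirically.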


In [6]:
df = df.dropna()

#df = df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]


outcomes_of_interest = ["Lifetime Post Consumers", "like"]

X_df = df[features].copy()
y_df = df[outcomes_of_interest].copy()

# one-hot encode the categorical features
cat_features = ["Category",
                "Type",
                "Paid"]

X_df = pd.get_dummies(X_df, columns = cat_features)

print(X_df.head()[["Category_1", "Category_2", "Category_3"]].to_latex())

# predict the first outcome of interest: "Lifetime Post Consumers"
X = X_df.values
y = y_df.values.T[0]

# min-max scale the outcome to [0, 1]
y = (y - y.min()) / (y.max() - y.min())

# (optional) seaborn plots of the outcomes -- a scatterplot and histogram of
# "Lifetime Post Consumers" and a joint plot of consumers vs. likes -- can be
# generated and saved here


\begin{tabular}{lrrr}
\toprule
{} &  Category\_1 &  Category\_2 &  Category\_3 \\
\midrule
0 &         0.0 &         1.0 &         0.0 \\
1 &         0.0 &         1.0 &         0.0 \\
2 &         0.0 &         0.0 &         1.0 \\
3 &         0.0 &         1.0 &         0.0 \\
4 &         0.0 &         1.0 &         0.0 \\
\bottomrule
\end{tabular}
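
As a quick sanity check on the encoding, the 13 columns of X are the 4 numeric features plus the one-hot dummies (a small sketch; the four Type levels are an assumption consistent with the 13-column shape reported below):

# 4 numeric features + Category (3 levels) + Type (4 levels) + Paid (2 levels) = 13
print(X_df.shape)           # expected (495, 13) after dropna()
print(list(X_df.columns))   # numeric columns first, then the *_<level> dummies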


In [7]:
n_test = 100
n_repeat = 1000

#estimator = DecisionTreeRegressor()
estimator = RandomForestRegressor()
#estimator = BayesianRidge(normalize = True)


# compute predictions over bootstrap resamples
y_predicts = np.zeros((n_repeat, len(X)))

for i in range(n_repeat):

    # draw a bootstrap sample of indices (with replacement)
    sample = np.random.choice(range(len(X)), replace = True, size = len(X))

    # hold the last n_test draws out as the test set, dropping any index
    # that also appears in the training part
    train_ids = sample[:-n_test]
    test_ids  = sample[-n_test:]
    test_ids = np.setdiff1d(test_ids, train_ids)
    if len(test_ids) == 0:
        continue

    X_train, y_train = X[train_ids], y[train_ids]
    X_test, y_test = X[test_ids], y[test_ids]

    estimator.fit(X_train, y_train)
    y_predict = estimator.predict(X_test)
    y_predicts[i, test_ids] = y_predict
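
As an aside, each bootstrap replicate above could equivalently be drawn with sklearn.utils.resample (a small sketch, not part of the original run):

from sklearn.utils import resample

# one replicate: indices sampled with replacement, the last n_test draws
# held out as the (deduplicated) test set, exactly as in the loop above
sample = resample(np.arange(len(X)), replace=True, n_samples=len(X))
train_ids = sample[:-n_test]
test_ids = np.setdiff1d(sample[-n_test:], train_ids)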

In [8]:
# per-sample squared bias and variance of the bootstrap predictions
y_bias = (y - np.mean(y_predicts, axis=0)) ** 2

y_error = ((y - y_predicts) ** 2).mean()
y_var = np.var(y_predicts, axis=0, ddof = 1)


print(np.mean(y_bias) + np.mean(y_var))

clf_type = "Random forest"
print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
      "+ {3:.4f} (var)".format(clf_type, np.mean(y_error),
                               np.mean(y_bias), np.mean(y_var)))

print("{0}: {1:.4f} ((bias^2) + (var)) = {2:.4f} (bias^2) "
      "+ {3:.4f} (var)".format(clf_type, np.mean(y_bias) + np.mean(y_var),
                               np.mean(y_bias), np.mean(y_var)))


0.01071066406
Random forest: 0.0107 (error) = 0.0100 (bias^2) + 0.0008 (var)
Random forest: 0.0107 ((bias^2) + (var)) = 0.0100 (bias^2) + 0.0008 (var)
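
Concretely, with R = n_repeat replicates and $\hat{y}_{r,i}$ the prediction for sample $i$ in replicate $r$ (zero when that sample was not held out in replicate $r$), the per-sample quantities averaged above are

$$
\widehat{\mathrm{bias}}^2_i = \Big(y_i - \frac{1}{R}\sum_{r=1}^{R} \hat{y}_{r,i}\Big)^2,
\qquad
\widehat{\mathrm{var}}_i = \frac{1}{R-1}\sum_{r=1}^{R}\Big(\hat{y}_{r,i} - \frac{1}{R}\sum_{s=1}^{R}\hat{y}_{s,i}\Big)^2 .
$$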

In [9]:
from sklearn.linear_model import SGDRegressor, BayesianRidge

#clf = RandomForestRegressor(n_estimators = 1000, max_depth = 2)
clf = BayesianRidge(normalize = True)

# baseline that always predicts the mean of y
dummy_clf = DummyRegressor()
scores = cross_val_score(clf, X, y, cv=10, scoring = make_scorer(mse))
dummy_scores = cross_val_score(dummy_clf, X, y, cv=10, scoring = make_scorer(mse))

print("MSE: %0.8f (+/- %0.8f)" % (scores.mean(), scores.std()))
print("Dummy MSE: %0.8f (+/- %0.8f)" % (dummy_scores.mean(), dummy_scores.std()))

#print(clf)


MSE: 0.00479575 (+/- 0.00511744)
Dummy MSE: 0.00616020 (+/- 0.00521769)
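
To put the two cross-validated scores in perspective, the relative reduction in MSE over the dummy baseline can be reported directly from the arrays computed above (a small sketch):

# relative MSE reduction of BayesianRidge over the mean-predicting baseline
improvement = 1.0 - scores.mean() / dummy_scores.mean()
print("Relative MSE reduction vs. dummy: {0:.1%}".format(improvement))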

Now let's train the regressor on the whole dataset


In [10]:
#clf = RandomForestRegressor(n_estimators = 500, criterion = "mse")
#clf = DecisionTreeRegressor()
# from sklearn.ensemble import BaggingRegressor
# from sklearn.ensemble import AdaBoostRegressor
# clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 1000)
clf = BayesianRidge()

print(X.shape, y.shape)

# fit on the full dataset and report the (in-sample) training MSE
clf.fit(X, y)

print(mse(y, clf.predict(X)))


(495, 13) (495,)
0.00442525939709
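
Finally, since BayesianRidge is a linear model, its learned weights can be matched back to the feature columns (a small sketch using the standard coef_ attribute; column order follows X_df):

# pair each column name with its learned weight, sorted by absolute magnitude
coef = pd.Series(clf.coef_, index=X_df.columns)
print(coef.reindex(coef.abs().sort_values(ascending=False).index))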